def santander_transaction_classification(
        output,
        project,
):
    tf_server_name = 'kfdemo-service'

    if platform != 'GCP':
        vop = dsl.VolumeOp(name="create_pvc",
                           resource_name="pipeline-pvc",
                           modes=dsl.VOLUME_MODE_RWM,
                           size="1Gi")
        checkout = dsl.ContainerOp(
            name="checkout",
            image="alpine/git:latest",
            command=[
                "git", "clone", "https://github.com/kubeflow/pipelines.git",
                str(output) + "/pipelines"
            ],
        ).apply(onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
        checkout.after(vop)

    if platform == 'GCP':
        deploy = kubeflow_deploy_op(model_dir=str(
            'gs://kubeflow-pipelines-demo/tfx/0b22081a-ed94-11e9-81fb-42010a800160/santander-customer-transaction-prediction-95qxr-268134926/data'
        ) + '/export/export',
                                    server_name=tf_server_name)
    else:
        deploy = kubeflow_deploy_op(
            cluster_name=project,
            model_dir=str(
                'gs://kubeflow-pipelines-demo/tfx/0b22081a-ed94-11e9-81fb-42010a800160/santander-customer-transaction-prediction-95qxr-268134926/data'
            ) + '/export/export',
            pvc_name=vop.outputs["name"],
            server_name=tf_server_name)

    webapp = dsl.ContainerOp(
        name='webapp',
        image='us.gcr.io/kf-pipelines/ml-pipeline-webapp-launcher:v0.3',
        arguments=["--model_name", 'santanderapp'])
    webapp.after(deploy)

    steps = [deploy, webapp]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(
                onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
def notebook_pipeline():
    """A pipeline to run a Jupyter notebook with elyra-ai/kfp-notebook and Papermill."""
    from kfp_notebook.pipeline import NotebookOp

    notebook_op = NotebookOp(name="${name}",
                             notebook="${notebook}",
                             cos_endpoint="${cos_endpoint}",
                             cos_bucket="${cos_bucket}",
                             cos_directory="${cos_directory}",
                             cos_dependencies_archive="${cos_dependencies_archive}",
                             requirements_url="${requirements_url}",
                             image="${image}")

    from kubernetes.client.models import V1EnvVar
    notebook_op.container.add_env_variable(
        V1EnvVar(name='AWS_ACCESS_KEY_ID', value="${cos_username}"))
    notebook_op.container.add_env_variable(
        V1EnvVar(name='AWS_SECRET_ACCESS_KEY', value="${cos_password}"))

    from kfp import onprem
    notebook_op.container.add_env_variable(
        V1EnvVar(name='DATA_DIR', value="${mount_path}"))
    notebook_op.apply(
        onprem.mount_pvc(pvc_name='${dataset_pvc}',
                         volume_name='${dataset_pvc}',
                         volume_mount_path='${mount_path}'))
def fuel_pipeline(bucket_name='gs://your-bucket/export',
                  input_file='folder/file',
                  output_folder='output folder',
                  epochs=10):
    preprocess = dsl.ContainerOp(name='preprocess',
                                 image='gcr.io/kb-poc-262417/fuel:latest',
                                 arguments=[
                                     '--input_file', input_file,
                                     '--output_folder', output_folder,
                                     '--bucket_name', bucket_name
                                 ])

    train = dsl.ContainerOp(
        name='train',
        image='gcr.io/kb-poc-262417/fuel/train:latest',
        arguments=['--bucket_name', bucket_name, '--epochs', epochs])
    train.after(preprocess)

    serve = dsl.ContainerOp(name='serve',
                            image='gcr.io/kb-poc-262417/fuel/serve:latest',
                            arguments=['--bucket_name', bucket_name])
    serve.after(train)

    steps = [preprocess, train, serve]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))
def pipeline(gs_bucket='gs://your-bucket/export',
             input_file_with_folder='input/churn.csv',
             output_folder='output',
             optimizer_name='RMSProp',
             learning_rate=0.003,
             momentum=0.01,
             model_dir='gs://your-bucket/export',
             model_name='dummy',
             server_name='dummy'):
    preprocess_args = [
        '--bucket_name', gs_bucket,
        '--input_file_with_folder', input_file_with_folder,
        '--output', output_folder
    ]
    preprocess = dsl.ContainerOp(
        name='preprocess',
        image='gcr.io/kube-2020/churn/preprocess:latest',
        arguments=preprocess_args)

    train_args = [
        '--bucket_name', gs_bucket,
        '--output_folder', output_folder,
        '--optimizer_name', optimizer_name,
        '--learning_rate', learning_rate,
        '--momentum', momentum
    ]
    train = dsl.ContainerOp(name='train',
                            image='gcr.io/kube-2020/churn/train:latest',
                            arguments=train_args)

    serve_args = [
        '--model_path', model_dir,
        '--model_name', model_name,
        '--server_name', server_name
    ]
    serve = dsl.ContainerOp(
        name='serve',
        image='gcr.io/kube-2020/churn/pipeline/deployer:latest',
        arguments=serve_args)

    steps = [preprocess, train, serve]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))

    train.after(preprocess)
    serve.after(train)
def mnist_pipeline(model_export_dir='gs://your-bucket/export',
                   train_steps='200',
                   learning_rate='0.01',
                   batch_size='100',
                   pvc_name=''):
    """
    Pipeline with three stages:
      1. train an MNIST classifier
      2. deploy a tf-serving instance to the cluster
      3. deploy a web-ui to interact with it
    """
    train = dsl.ContainerOp(
        name='train',
        image='gcr.io/kubeflow-examples/mnist/model:v20190304-v0.2-176-g15d997b',
        arguments=[
            "/opt/model.py", "--tf-export-dir", model_export_dir,
            "--tf-train-steps", train_steps, "--tf-batch-size", batch_size,
            "--tf-learning-rate", learning_rate
        ])

    serve_args = [
        '--model-export-path', model_export_dir,
        '--server-name', "mnist-service"
    ]
    if platform != 'GCP':
        serve_args.extend(
            ['--cluster-name', "mnist-pipeline", '--pvc-name', pvc_name])

    serve = dsl.ContainerOp(
        name='serve',
        image='gcr.io/ml-pipeline/ml-pipeline-kubeflow-deployer:'
        'e9b96de317989a9673ef88d88fb9dab9dac3005f',
        arguments=serve_args)
    serve.after(train)

    web_ui = dsl.ContainerOp(
        name='web-ui',
        image='brightfly/kubeflow-deploy-service:handson',
        arguments=[
            '--image',
            'gcr.io/kubeflow-examples/mnist/web-ui:v20190304-v0.2-176-g15d997b-pipelines',
            '--name', 'web-ui', '--container-port', '5000',
            '--service-port', '80', '--service-type', "LoadBalancer",
            '--cluster-name', "mnist-pipeline"
        ])
    web_ui.after(serve)

    steps = [train, serve, web_ui]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))
def train_pipeline(output="/mnt/model.h5",
                   result="/mnt/results.txt",
                   pvc_name="train-vol",
                   pvc_path="/mnt",
                   epochs=30,
                   validations=10,
                   trainset='/cut',
                   testset='/cut',
                   input='/train.csv',
                   filenames='id',
                   target='has_scratch',
                   train_size=0.8,
                   learn_rate=0.0001,
                   workers=2):
    train = train_op(epochs, validations, workers, pvc_path, trainset, input,
                     filenames, target, train_size, learn_rate, output).apply(
                         onprem.mount_pvc("train-vol", 'local-storage', "/mnt"))

    load = load_op(workers, pvc_path, testset, input, filenames, target,
                   train.outputs['output'], result).apply(
                       onprem.mount_pvc("train-vol", 'local-storage', "/mnt"))
def mnist_pipeline(model_export_dir='gs://kf-test1234/export',
                   project='<your project id>',
                   bucket_name='kf-test1234',
                   n_class='10',
                   pvc_name=''):
    test = _test(project, bucket_name, n_class, model_export_dir)

    steps = [test]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))
def email_pipeline(
        server_secret="server-secret",
        subject="Hi, again!",
        body="Tekton email",
        sender="*****@*****.**",
        recipients="[email protected], [email protected]",
        attachment_filepath="/tmp/data/output.txt"):
    email = email_op(server_secret=server_secret,
                     subject=subject,
                     body=body,
                     sender=sender,
                     recipients=recipients,
                     attachment_path=attachment_filepath)
    email.add_env_variable(
        env_from_secret('USER', '$(params.server_secret)', 'user'))
    email.add_env_variable(
        env_from_secret('PASSWORD', '$(params.server_secret)', 'password'))
    email.add_env_variable(
        env_from_secret('TLS', '$(params.server_secret)', 'tls'))
    email.add_env_variable(
        env_from_secret('SERVER', '$(params.server_secret)', 'url'))
    email.add_env_variable(
        env_from_secret('PORT', '$(params.server_secret)', 'port'))
    email.apply(onprem.mount_pvc('shared-pvc', 'shared-pvc', attachment_path))

    with dsl.ExitHandler(email):
        write_file_task = write_file(attachment_filepath).apply(
            onprem.mount_pvc('shared-pvc', 'shared-pvc', attachment_path))
def pipeline(dataset_location='/mnt/data/manipulated_fashion_mnist.csv', test_size=0.3, random_state=42, input_shape_height=28, input_shape_width=28, use_pretrained_model='False', model_units_num=128, model_outputs_num=10, model_activation_func_layer2='relu', model_activation_func_layer3='softmax', optimizer='adam', loss='binary_crossentropy', metrics='accuracy', num_epochs=10, location_prepared_dataset='/mnt/data/prep_fashion_mnist.csv', location_improved_dataset='/mnt/data/impr_fasion_mnist.csv', location_training_images='/mnt/data/train_img.csv', location_training_labels='/mnt/data/train_labels.csv', location_test_images='/mnt/data/test_img.csv', location_test_labels='/mnt/data/test_labels.csv', location_base_model='/mnt/model/base_model.h5', location_trained_model='/mnt/model/trained_model.h5'): data_preparation = data_prep_op(dataset_location, location_prepared_dataset).apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt")) feature_engineering = feature_eng_op(data_preparation.outputs['output'], location_improved_dataset).apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt")) data_split = data_split_op(feature_engineering.outputs['output'], test_size, random_state, location_training_images, location_training_labels, location_test_images, location_test_labels).apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt")) with dsl.Condition(use_pretrained_model == 'True'): model_building = model_download_op(input_shape_height, input_shape_width, location_base_model).apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt")) model_training = model_train_op(data_split.outputs['train_img'], data_split.outputs['train_label'], input_shape_height, input_shape_width, model_building.outputs['output_model_loc'], num_epochs, location_trained_model).apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt")) model_evaluation = model_eval_op(data_split.outputs['test_img'], data_split.outputs['test_label'], input_shape_height, input_shape_width, model_training.outputs['output_model_loc'], '/mlpipeline-ui-metadata.json').apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt")) with dsl.Condition(use_pretrained_model == 'False'): model_building = model_build_op(input_shape_height, input_shape_width, model_units_num, model_outputs_num, model_activation_func_layer2, model_activation_func_layer3, optimizer, loss, metrics, location_base_model).apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt")) model_training = model_train_op(data_split.outputs['train_img'], data_split.outputs['train_label'], input_shape_height, input_shape_width, model_building.outputs['output_model_loc'], num_epochs, location_trained_model).apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt")) model_evaluation = model_eval_op(data_split.outputs['test_img'], data_split.outputs['test_label'], input_shape_height, input_shape_width, model_training.outputs['output_model_loc'], '/mlpipeline-ui-metadata.json').apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt"))
def testVolumeMountingPipelineOperatorFuncs(self):
    mount_volume_op = onprem.mount_pvc('my-persistent-volume-claim',
                                       'my-volume-name',
                                       '/mnt/volume-mount-path')
    config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
        pipeline_operator_funcs=[mount_volume_op])

    kubeflow_dag_runner.KubeflowDagRunner(config=config).run(
        _two_step_pipeline())
    file_path = 'two_step_pipeline.tar.gz'
    self.assertTrue(fileio.exists(file_path))

    with tarfile.TarFile.open(file_path).extractfile(
            'pipeline.yaml') as pipeline_file:
        self.assertIsNotNone(pipeline_file)
        pipeline = yaml.safe_load(pipeline_file)

        container_templates = [
            c for c in pipeline['spec']['templates'] if 'container' in c
        ]
        self.assertEqual(2, len(container_templates))

        volumes = [{
            'name': 'my-volume-name',
            'persistentVolumeClaim': {
                'claimName': 'my-persistent-volume-claim'
            }
        }]

        # Check that the PVC is specified for kfp<=0.1.31.1.
        if 'volumes' in pipeline['spec']:
            self.assertEqual(volumes, pipeline['spec']['volumes'])

        for template in container_templates:
            # Check that each container has the volume mounted.
            self.assertEqual([{
                'name': 'my-volume-name',
                'mountPath': '/mnt/volume-mount-path'
            }], template['container']['volumeMounts'])

            # Check that each template has the PVC specified for kfp>=0.1.31.2.
            if 'volumes' in template:
                self.assertEqual(volumes, template['volumes'])
def body_parts_pipeline(model_dir='gs://your-bucket/export',
                        model_name='dummy',
                        server_name='dummy'):
    serve_args = [
        '--model_path', model_dir,
        '--model_name', model_name,
        '--server_name', server_name
    ]
    serve = dsl.ContainerOp(name='serve',
                            image='gcr.io/bigdata-2020/dlaas/body:latest',
                            arguments=serve_args)

    steps = [serve]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))
def testVolumeMountingPipelineOperatorFuncs(self):
    mount_volume_op = onprem.mount_pvc('my-persistent-volume-claim',
                                       'my-volume-name',
                                       '/mnt/volume-mount-path')
    config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
        pipeline_operator_funcs=[mount_volume_op])

    kubeflow_dag_runner.KubeflowDagRunner(config=config).run(
        _two_step_pipeline())
    file_path = os.path.join(self.test_dir, 'two_step_pipeline.tar.gz')
    self.assertTrue(tf.gfile.Exists(file_path))

    with tarfile.TarFile.open(file_path).extractfile(
            'pipeline.yaml') as pipeline_file:
        self.assertIsNotNone(pipeline_file)
        pipeline = yaml.load(pipeline_file)

        containers = [
            c for c in pipeline['spec']['templates'] if 'container' in c
        ]
        self.assertEqual(2, len(containers))

        # Check that each container has the volume mounted.
        self.assertEqual([{
            'name': 'my-volume-name',
            'mountPath': '/mnt/volume-mount-path'
        }], containers[0]['container']['volumeMounts'])
        self.assertEqual([{
            'name': 'my-volume-name',
            'mountPath': '/mnt/volume-mount-path'
        }], containers[1]['container']['volumeMounts'])

        # Check that the PVC is specified.
        self.assertEqual([{
            'name': 'my-volume-name',
            'persistentVolumeClaim': {
                'claimName': 'my-persistent-volume-claim'
            }
        }], pipeline['spec']['volumes'])
def mnist_pipeline(experiment_name='mnist',
                   namespace='kubeflow',
                   gs_bucket='gs://your-bucket/export',
                   epochs=10,
                   batch_size=128,
                   model_dir='gs://your-bucket/export',
                   model_name='dummy',
                   server_name='dummy'):
    train_args = [
        '--bucket_name', gs_bucket,
        '--epochs', epochs,
    ]

    convert_op1 = func_to_container_op(step1)
    op1 = convert_op1()

    convert_op = func_to_container_op(convert_mnist_experiment_result)
    op2 = convert_op(op1.output, gs_bucket, epochs)

    train = dsl.ContainerOp(
        name='train',
        image='gcr.io/dais-data-dev-txwj/mnist/train:latest',
        arguments=op2.output)

    steps = [op1, op2, train]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))

    op2.after(op1)
    train.after(op2)
def main(unused_argv):
    serving_model_dir = os.path.join(FLAGS.project_root, 'serving_model',
                                     FLAGS.pipeline_name)
    module_file = os.path.join(FLAGS.project_root, 'titanic_keras_utils.py')
    # Root directory to store pipeline artifacts.
    pipeline_root = os.path.join(FLAGS.project_root, 'pipeline')

    metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config()

    # This pipeline automatically injects the Kubeflow TFX image if the
    # environment variable 'KUBEFLOW_TFX_IMAGE' is defined. Currently, the tfx
    # cli tool exports the environment variable to pass to the pipelines.
    tfx_image = os.environ.get('KUBEFLOW_TFX_IMAGE', None)

    runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
        kubeflow_metadata_config=metadata_config,
        # Specify custom docker image to use.
        tfx_image=tfx_image,
        pipeline_operator_funcs=(
            # If running on K8s Engine (GKE) on Google Cloud Platform (GCP),
            # kubeflow_dag_runner.get_default_pipeline_operator_funcs()
            # provides default configurations specifically for GKE on GCP,
            # such as secrets.
            [
                onprem.mount_pvc(_persistent_volume_claim, _persistent_volume,
                                 _persistent_volume_mount)
            ]))

    kubeflow_dag_runner.KubeflowDagRunner(config=runner_config).run(
        create_tfx_pipeline(
            pipeline_name=FLAGS.pipeline_name,
            pipeline_root=pipeline_root,
            data_root=FLAGS.data_root,
            module_file=module_file,
            serving_model_dir=serving_model_dir,
            # 0 means auto-detect based on the number of CPUs available
            # during execution time.
            direct_num_workers=0))
def mnist_pipeline(gs_bucket='gs://your-bucket/export',
                   epochs=10,
                   batch_size=128,
                   model_dir='gs://your-bucket/export',
                   model_name='dummy',
                   server_name='dummy'):
    train_args = [
        '--bucket_name', gs_bucket,
        '--epochs', epochs,
        '--batch_size', batch_size
    ]
    train = dsl.ContainerOp(name='train',
                            image='gcr.io/kb-poc-262417/mnist/train:latest',
                            arguments=train_args)

    serve_args = [
        '--model_path', model_dir,
        '--model_name', model_name,
        '--server_name', server_name
    ]
    serve = dsl.ContainerOp(
        name='serve',
        image='gcr.io/kb-poc-262417/mnist/pipeline/deployer:latest',
        arguments=serve_args)

    steps = [train, serve]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))

    serve.after(train)
def mnist_train_pipeline(model_export_dir='gs://your-bucket/export',
                         train_steps='200',
                         learning_rate='0.01',
                         batch_size='100',
                         pvc_name=''):
    """
    Pipeline with three stages:
      1. train an MNIST classifier
      2. deploy a tf-serving instance to the cluster
      3. deploy a web-ui to interact with it
    """
    train = dsl.ContainerOp(name='train',
                            image='TRAIN_IMG',
                            arguments=[
                                "/opt/model.py", "--tf-export-dir",
                                model_export_dir, "--tf-train-steps",
                                train_steps, "--tf-batch-size", batch_size,
                                "--tf-learning-rate", learning_rate
                            ])

    push = dsl.ContainerOp(name='push',
                           image='PUSH_IMG',
                           arguments=[
                               "/opt/entrypoint.sh",
                               "PUSH_REPO",
                               "PUSH_LOGIN",
                               "PUSH_PASS",
                               "PUSH_SHA",
                           ])
    push.after(train)

    steps = [train, push]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))
def ai_training_run( # Define variables that the user can set in the pipelines UI; set default values dataset_volume_pvc_existing: str = "dataset-vol", trained_model_volume_pvc_existing: str = "kfp-model-vol", execute_data_prep_step__yes_or_no: str = "yes", data_prep_step_container_image: str = "nvcr.io/nvidia/tensorflow:21.03-tf1-py3", data_prep_step_command: str = "<insert command here>", data_prep_step_dataset_volume_mountpoint: str = "/mnt/dataset", train_step_container_image: str = "nvcr.io/nvidia/tensorflow:21.03-tf1-py3", train_step_command: str = "<insert command here>", train_step_dataset_volume_mountpoint: str = "/mnt/dataset", train_step_model_volume_mountpoint: str = "/mnt/model", validation_step_container_image: str = "nvcr.io/nvidia/tensorflow:21.03-tf1-py3", validation_step_command: str = "<insert command here>", validation_step_dataset_volume_mountpoint: str = "/mnt/dataset", validation_step_model_volume_mountpoint: str = "/mnt/model"): # Set GPU limits; Due to SDK limitations, this must be hardcoded train_step_num_gpu = 0 validation_step_num_gpu = 0 # Pipeline Steps: # Execute data prep step with dsl.Condition(execute_data_prep_step__yes_or_no == "yes"): data_prep = dsl.ContainerOp(name="data-prep", image=data_prep_step_container_image, command=["sh", "-c"], arguments=[data_prep_step_command]) # Mount dataset volume/pvc data_prep.apply( onprem.mount_pvc(dataset_volume_pvc_existing, 'dataset', data_prep_step_dataset_volume_mountpoint)) # Create a snapshot of the dataset volume/pvc for traceability volume_snapshot_name = "dataset-{{workflow.uid}}" dataset_snapshot = dsl.ContainerOp( name="dataset-snapshot", image="python:3", command=["/bin/bash", "-c"], arguments=[ "\ python3 -m pip install netapp-dataops-k8s && \ echo '" + volume_snapshot_name + "' > /volume_snapshot_name.txt && \ netapp_dataops_k8s_cli.py create volume-snapshot --pvc-name=" + str(dataset_volume_pvc_existing) + " --snapshot-name=" + str(volume_snapshot_name) + " --namespace={{workflow.namespace}}" ], file_outputs={"volume_snapshot_name": "/volume_snapshot_name.txt"}) # State that snapshot should be created after the data prep job completes dataset_snapshot.after(data_prep) # Execute training step train = dsl.ContainerOp(name="train-model", image=train_step_container_image, command=["sh", "-c"], arguments=[train_step_command]) # Mount dataset volume/pvc train.apply( onprem.mount_pvc(dataset_volume_pvc_existing, 'datavol', train_step_dataset_volume_mountpoint)) # Mount model volume/pvc train.apply( onprem.mount_pvc(trained_model_volume_pvc_existing, 'modelvol', train_step_model_volume_mountpoint)) # Request that GPUs be allocated to training job pod if train_step_num_gpu > 0: train.set_gpu_limit(train_step_num_gpu, 'nvidia') # State that training job should be executed after dataset volume snapshot is taken train.after(dataset_snapshot) # Create a snapshot of the model volume/pvc for model versioning volume_snapshot_name = "kfp-model-{{workflow.uid}}" model_snapshot = dsl.ContainerOp( name="model-snapshot", image="python:3", command=["/bin/bash", "-c"], arguments=[ "\ python3 -m pip install netapp-dataops-k8s && \ echo '" + volume_snapshot_name + "' > /volume_snapshot_name.txt && \ netapp_dataops_k8s_cli.py create volume-snapshot --pvc-name=" + str(trained_model_volume_pvc_existing) + " --snapshot-name=" + str(volume_snapshot_name) + " --namespace={{workflow.namespace}}" ], file_outputs={"volume_snapshot_name": "/volume_snapshot_name.txt"}) # State that snapshot should be created after the training job 
completes model_snapshot.after(train) # Execute inference validation job inference_validation = dsl.ContainerOp( name="validate-model", image=validation_step_container_image, command=["sh", "-c"], arguments=[validation_step_command]) # Mount dataset volume/pvc inference_validation.apply( onprem.mount_pvc(dataset_volume_pvc_existing, 'datavol', validation_step_dataset_volume_mountpoint)) # Mount model volume/pvc inference_validation.apply( onprem.mount_pvc(trained_model_volume_pvc_existing, 'modelvol', validation_step_model_volume_mountpoint)) # Request that GPUs be allocated to pod if validation_step_num_gpu > 0: inference_validation.set_gpu_limit(validation_step_num_gpu, 'nvidia') # State that inference validation job should be executed after model volume snapshot is taken inference_validation.after(model_snapshot)
def mnist_pipeline(): ENV_MANAGE_URL = V1EnvVar(name='MANAGE_URL', value='http://220.116.228.93:8088/send') data_0 = dsl.ContainerOp( name="load & preprocess data pipeline", image="byeongjokim/mnist-pre-data:latest", ).set_display_name('collect & preprocess data')\ .apply(onprem.mount_pvc("data-pvc", volume_name="data", volume_mount_path="/data")) data_1 = dsl.ContainerOp( name="validate data pipeline", image="byeongjokim/mnist-val-data:latest", ).set_display_name('validate data').after(data_0)\ .apply(onprem.mount_pvc("data-pvc", volume_name="data", volume_mount_path="/data")) train_model = dsl.ContainerOp( name="train embedding model", image="byeongjokim/mnist-train-model:latest", ).set_display_name('train model').after(data_1)\ .apply(onprem.mount_pvc("data-pvc", volume_name="data", volume_mount_path="/data"))\ .apply(onprem.mount_pvc("train-model-pvc", volume_name="train-model", volume_mount_path="/model")) embedding = dsl.ContainerOp( name="embedding data using embedding model", image="byeongjokim/mnist-embedding:latest", ).set_display_name('embedding').after(train_model)\ .apply(onprem.mount_pvc("data-pvc", volume_name="data", volume_mount_path="/data"))\ .apply(onprem.mount_pvc("train-model-pvc", volume_name="train-model", volume_mount_path="/model")) train_faiss = dsl.ContainerOp( name="train faiss", image="byeongjokim/mnist-train-faiss:latest", ).set_display_name('train faiss').after(embedding)\ .apply(onprem.mount_pvc("data-pvc", volume_name="data", volume_mount_path="/data"))\ .apply(onprem.mount_pvc("train-model-pvc", volume_name="train-model", volume_mount_path="/model")) analysis = dsl.ContainerOp( name="analysis total", image="byeongjokim/mnist-analysis:latest", file_outputs={ "confusion_matrix": "/confusion_matrix.csv", "mlpipeline-ui-metadata": "/mlpipeline-ui-metadata.json", "accuracy": "/accuracy.json", "mlpipeline_metrics": "/mlpipeline-metrics.json" } ).add_env_variable(ENV_MANAGE_URL).set_display_name('analysis').after(train_faiss)\ .apply(onprem.mount_pvc("data-pvc", volume_name="data", volume_mount_path="/data"))\ .apply(onprem.mount_pvc("train-model-pvc", volume_name="train-model", volume_mount_path="/model")) baseline = 0.8 with dsl.Condition(analysis.outputs["accuracy"] > baseline) as check_deploy: deploy = dsl.ContainerOp( name="deploy mar", image="byeongjokim/mnist-deploy:latest", ).add_env_variable(ENV_MANAGE_URL).set_display_name('deploy').after(analysis)\ .apply(onprem.mount_pvc("train-model-pvc", volume_name="train-model", volume_mount_path="/model"))\ .apply(onprem.mount_pvc("deploy-model-pvc", volume_name="deploy-model", volume_mount_path="/deploy-model"))
def santander_transaction_classification(
        output,
        project,
        train='gs://kubeflow-pipelines-demo/dataset/train.csv',
        evaluation='gs://kubeflow-pipelines-demo/dataset/test.csv',
        mode='local',
        preprocess_module='gs://kubeflow-pipelines-demo/dataset/preprocessing.py',
        learning_rate=0.1,
        hidden_layer_size='1500',
        steps=3000):
    output_template = str(output) + '/{{workflow.uid}}/{{pod.name}}/data'
    target_class_lambda = """lambda x: x['target']"""

    tf_server_name = 'kfdemo-service'

    if platform != 'GCP':
        vop = dsl.VolumeOp(name="create_pvc",
                           resource_name="pipeline-pvc",
                           modes=dsl.VOLUME_MODE_RWM,
                           size="1Gi")
        checkout = dsl.ContainerOp(
            name="checkout",
            image="alpine/git:latest",
            command=[
                "git", "clone", "https://github.com/kubeflow/pipelines.git",
                str(output) + "/pipelines"
            ],
        ).apply(onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
        checkout.after(vop)

    preprocess = dataflow_tf_transform_op(
        training_data_file_pattern=train,
        evaluation_data_file_pattern=evaluation,
        schema="not.txt",
        gcp_project=project,
        run_mode=mode,
        preprocessing_module=preprocess_module,
        transformed_data_dir=output_template)

    training = tf_train_op(transformed_data_dir=preprocess.output,
                           schema='not.txt',
                           learning_rate=learning_rate,
                           hidden_layer_size=hidden_layer_size,
                           steps=steps,
                           target='tips',
                           preprocessing_module=preprocess_module,
                           training_output_dir=output_template)

    prediction = dataflow_tf_predict_op(
        data_file_pattern=evaluation,
        schema='not.txt',
        target_column='tips',
        model=training.outputs['training_output_dir'],
        run_mode=mode,
        gcp_project=project,
        predictions_dir=output_template)

    cm = confusion_matrix_op(predictions=prediction.outputs['predictions_dir'],
                             output_dir=output_template)

    roc = roc_op(predictions_dir=prediction.outputs['predictions_dir'],
                 target_lambda=target_class_lambda,
                 output_dir=output_template)

    steps = [training, prediction, cm, roc]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(
                onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
def taxi_cab_classification( project, output='s3://mlpipeline/tfx/output', column_names='s3://mlpipeline/tfx/taxi-cab-classification/column-names.json', key_columns='trip_start_timestamp', train='s3://mlpipeline/tfx/taxi-cab-classification/train.csv', evaluation='s3://mlpipeline/tfx/taxi-cab-classification/eval.csv', mode='local', preprocess_module='s3://mlpipeline/tfx/taxi-cab-classification/preprocessing.py', learning_rate=0.1, hidden_layer_size='1500', steps=3000, analyze_slice_column='trip_start_hour' ): output_template = str(output) + '/{{workflow.uid}}/{{pod.name}}/data' target_lambda = """lambda x: (x['target'] > x['fare'] * 0.2)""" target_class_lambda = """lambda x: 1 if (x['target'] > x['fare'] * 0.2) else 0""" tf_server_name = 'taxi-cab-classification-model-{{workflow.uid}}' if platform == 'onprem': if storage == 'minio': data_preparation = dsl.ContainerOp( name="data_preparation", image="aiven86/minio_mc-git", command=["sh", "/bin/run.sh"], ).set_image_pull_policy('IfNotPresent') data_preparation.container.add_env_variable(V1EnvVar(name='GITPAHT', value=GITPAHT)) data_preparation.container.add_env_variable(V1EnvVar(name='GITDIR', value=GITDIR)) data_preparation.container.add_env_variable(V1EnvVar(name='MINIOPATH', value=MINIOPATH)) data_preparation.container.add_env_variable(V1EnvVar(name='DATAPATH', value=DATAPATH)) else: vop = dsl.VolumeOp( name="create_pvc", storage_class="rook-ceph-fs", resource_name="pipeline-pvc", modes=dsl.VOLUME_MODE_RWM, size="1Gi" ) data_preparation = dsl.ContainerOp( name="data_preparation", image="aiven86/git", command=["git", "clone", "https://github.com/kubeflow/pipelines.git", str(output) + "/pipelines"], ).apply(onprem.mount_pvc(vop.outputs["name"], 'local-storage', output)) data_preparation.after(vop) validation = dataflow_tf_data_validation_op( inference_data=train, validation_data=evaluation, column_names=column_names, key_columns=key_columns, gcp_project=project, run_mode=mode, validation_output=output_template, ) if platform == 'onprem': validation.after(data_preparation) preprocess = dataflow_tf_transform_op( training_data_file_pattern=train, evaluation_data_file_pattern=evaluation, schema=validation.outputs['schema'], gcp_project=project, run_mode=mode, preprocessing_module=preprocess_module, transformed_data_dir=output_template ) training = tf_train_op( transformed_data_dir=preprocess.output, schema=validation.outputs['schema'], learning_rate=learning_rate, hidden_layer_size=hidden_layer_size, steps=steps, target='tips', preprocessing_module=preprocess_module, training_output_dir=output_template ) analysis = dataflow_tf_model_analyze_op( model=training.output, evaluation_data=evaluation, schema=validation.outputs['schema'], gcp_project=project, run_mode=mode, slice_columns=analyze_slice_column, analysis_results_dir=output_template ) prediction = dataflow_tf_predict_op( data_file_pattern=evaluation, schema=validation.outputs['schema'], target_column='tips', model=training.output, run_mode=mode, gcp_project=project, predictions_dir=output_template ) cm = confusion_matrix_op( predictions=prediction.output, target_lambda=target_lambda, output_dir=output_template ) roc = roc_op( predictions_dir=prediction.output, target_lambda=target_class_lambda, output_dir=output_template ) if platform == 'GCP' or storage == 'minio': deploy = kubeflow_deploy_op( model_dir=str(training.output) + '/export/export', server_name=tf_server_name ) elif platform == 'onprem' and storage != 'minio': deploy = kubeflow_deploy_op( cluster_name=project, 
model_dir=str(training.output) + '/export/export', pvc_name=vop.outputs["name"], server_name=tf_server_name ) steps = [validation, preprocess, training, analysis, prediction, cm, roc, deploy] for step in steps: if platform == 'GCP': step.apply(gcp.use_gcp_secret('user-gcp-sa')) elif platform == 'onprem': if storage == 'minio': step.container.add_env_variable(V1EnvVar(name='AWS_ACCESS_KEY_ID', value=AWS_ACCESS_KEY_ID)) step.container.add_env_variable(V1EnvVar(name='AWS_SECRET_ACCESS_KEY', value=AWS_SECRET_ACCESS_KEY)) step.container.add_env_variable(V1EnvVar(name='AWS_REGION', value=AWS_REGION)) step.container.add_env_variable(V1EnvVar(name='S3_ENDPOINT', value=S3_ENDPOINT)) step.container.add_env_variable(V1EnvVar(name='S3_USE_HTTPS', value=S3_USE_HTTPS)) step.container.add_env_variable(V1EnvVar(name='S3_VERIFY_SSL', value=S3_VERIFY_SSL)) else: step.apply(onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
def mnist_pipeline(experiment_name='mnist', namespace='kubeflow', gs_bucket='gs://your-bucket/export', epochs=10, batch_size=128, model_dir='gs://your-bucket/export', model_name='dummy', server_name='dummy'): objectiveConfig = { "type": "maximize", "goal": 0.85, "objectiveMetricName": "accuracy", } algorithmConfig = {"algorithmName": "random"} parameters = [ { "name": "--learning_rate", "parameterType": "double", "feasibleSpace": { "min": "0.001", "max": "0.003" } }, { "name": "--batch_size", "parameterType": "discrete", "feasibleSpace": { "list": ["512", "1024", "2048"] } }, ] rawTemplate = { "apiVersion": "kubeflow.org/v1", "kind": "TFJob", "metadata": { "name": "{{.Trial}}", "namespace": "{{.NameSpace}}" }, "spec": { "tfReplicaSpecs": { "Worker": { "replicas": 1, "restartPolicy": "OnFailure", "template": { "spec": { "containers": [{ "name": "tensorflow", "image": "gcr.io/dais-data-dev-txwj/mnist/train:latest", "imagePullPolicy": "Always", "command": ["sh", "-c"], "args": [ "python /train.py --epochs 1 --save_model 0 {{- with .HyperParameters}} {{- range .}} {{.Name}}={{.Value}} {{- end}} {{- end}}" ] }] } } } } } } trialTemplate = {"goTemplate": {"rawTemplate": json.dumps(rawTemplate)}} katib_experiment_launcher_op = components.load_component_from_url( 'https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/katib-launcher/component.yaml' ) katibOp = katib_experiment_launcher_op(experiment_name=experiment_name, experiment_namespace=namespace, parallel_trial_count=3, max_trial_count=12, objective=str(objectiveConfig), algorithm=str(algorithmConfig), trial_template=str(trialTemplate), parameters=str(parameters), delete_finished_experiment=False) train_args = [ '--bucket_name', gs_bucket, '--epochs', epochs, ] convert_op = func_to_container_op(convert_mnist_experiment_result) op2 = convert_op(katibOp.output, gs_bucket, epochs) train = dsl.ContainerOp( name='train', image='gcr.io/dais-data-dev-txwj/mnist/train:latest', arguments=op2.output) serve_args = [ '--model_path', model_dir, '--model_name', model_name, '--server_name', server_name ] serve = dsl.ContainerOp( name='serve', image='gcr.io/dais-data-dev-txwj/mnist/serve:latest', arguments=serve_args) steps = [katibOp, op2, train, serve] for step in steps: if platform == 'GCP': step.apply(gcp.use_gcp_secret('user-gcp-sa')) else: step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt')) op2.after(katibOp) train.after(op2) serve.after(train)
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        additional_pipeline_args={
            'tfx_image': 'tensorflow/tfx:0.14.0rc1'
        },
        log_root='/var/tmp/tfx/logs',
    )


if __name__ == '__main__':
    mount_volume_op = onprem.mount_pvc("tfx-pvc", "tfx-pv", _tfx_root)
    config = KubeflowDagRunnerConfig(
        pipeline_operator_funcs=[mount_volume_op])
    _pipeline = _create_pipeline(
        pipeline_name=_pipeline_name,
        pipeline_root=_pipeline_root,
        data_root=os.path.join(_pipeline_root, 'data'),
        module_file=_module_file,
        serving_model_dir=_serving_model_dir,
    )
    KubeflowRunner(config=config).run(_pipeline)
        eval_steps=100,
    )

    runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
        kubeflow_metadata_config=metadata_config,
        # Specify custom docker image to use.
        tfx_image=tfx_image,
        pipeline_operator_funcs=(
            # If running on K8s Engine (GKE) on Google Cloud Platform (GCP),
            # kubeflow_dag_runner.get_default_pipeline_operator_funcs()
            # provides default configurations specifically for GKE on GCP,
            # such as secrets.
            kubeflow_dag_runner.get_default_pipeline_operator_funcs() + [
                onprem.mount_pvc(
                    persistent_volume_claim,
                    persistent_volume,
                    persistent_volume_mount,
                )
            ]),
    )

    p = init_kubeflow_pipeline(components, output_base, direct_num_workers=0)
    output_filename = f"{pipeline_name}.yaml"
    kubeflow_dag_runner.KubeflowDagRunner(
        config=runner_config,
        output_dir=output_dir,
        output_filename=output_filename,
    ).run(p)
def env_params(name,
               value='https://s3.us.cloud-object-storage.appdomain.cloud'):

    def _env_params(task):
        from kubernetes import client as k8s_client
        return (task.add_env_variable(
            k8s_client.V1EnvVar(name=name, value=value)))

    return _env_params


s3_endpoint_params = env_params('S3_ENDPOINT', 'minio-service.kubeflow:9000')
s3_use_https_params = env_params('S3_USE_HTTPS', '0')
s3_verify_ssl_params = env_params('S3_VERIFY_SSL', '0')
access_key_id_params = env_params('AWS_ACCESS_KEY_ID', 'minio')
secret_access_key_params = env_params('AWS_SECRET_ACCESS_KEY', 'minio123')

mount_volume_op = onprem.mount_pvc('tfx-volume', 'shared-volume',
                                   _output_bucket)


def _create_test_pipeline(pipeline_root: Text, csv_input_location: Text,
                          taxi_module_file: Text, output_bucket: Text,
                          enable_cache: bool):
    """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline.

    Args:
      pipeline_name: The name of the pipeline.
      pipeline_root: The root of the pipeline output.
      csv_input_location: The location of the input data directory.
      taxi_module_file: The location of the module file for Transform/Trainer.
      enable_cache: Whether to enable cache or not.

    Returns:
def mnist_pipeline(model_export_dir='/mnt/export', train_steps='1000', learning_rate='0.01', batch_size='100', output='/mnt'): """ Pipeline with three stages: 1. train an MNIST classifier 2. deploy a tf-serving instance to the cluster 3. deploy a web-ui to interact with it """ if platform != 'GCP': vop = dsl.VolumeOp(name="create_pvc", storage_class="rook-ceph-fs", resource_name="pipeline-pvc", modes=dsl.VOLUME_MODE_RWM, size="1Gi") pvc_name = vop.outputs["name"] download = dsl.ContainerOp( name="download_data", image="aiven86/git", command=[ "git", "clone", "https://github.com/cdyangzhenyu/mnist-data.git", str(output) + "/data" ], ).apply(onprem.mount_pvc(pvc_name, 'local-storage', output)) download.after(vop) train = dsl.ContainerOp(name='train', image='aiven86/tensorflow-mnist-kfp:1.13.1-gpu', arguments=[ "/opt/model.py", "--tf-data-dir", str(output) + "/data", "--tf-export-dir", model_export_dir, "--tf-train-steps", train_steps, "--tf-batch-size", batch_size, "--tf-learning-rate", learning_rate ]).add_resource_limit("aliyun.com/gpu-mem", 1) train.after(download) serve_args = [ '--model-export-path', model_export_dir, '--server-name', "mnist-service" ] if platform != 'GCP': serve_args.extend( ['--cluster-name', "mnist-pipeline", '--pvc-name', pvc_name]) serve = dsl.ContainerOp( name='serve', image='aiven86/ml-pipeline_ml-pipeline-kubeflow-deployer:' '7775692adf28d6f79098e76e839986c9ee55dd61', arguments=serve_args) serve.after(train) webui_args = [ '--image', 'aiven86/kubeflow-examples_mnist_web-ui:' 'v20190304-v0.2-176-g15d997b-pipelines', '--name', 'web-ui', '--container-port', '5000', '--service-port', '80', '--service-type', "NodePort" ] if platform != 'GCP': webui_args.extend(['--cluster-name', "mnist-pipeline"]) web_ui = dsl.ContainerOp( name='web-ui', image='aiven86/kubeflow-examples_mnist_deploy-service:latest', arguments=webui_args).set_image_pull_policy('IfNotPresent') web_ui.after(serve) steps = [train, serve, web_ui] for step in steps: if platform == 'GCP': step.apply(gcp.use_gcp_secret('user-gcp-sa')) else: step.apply(onprem.mount_pvc(pvc_name, 'local-storage', output))
def taxi_cab_classification(
        pvc_size='1Gi',
        project='tfx-taxi-pipeline-on-prem',
        column_names='pipelines/samples/tfx/taxi-cab-classification/column-names.json',
        key_columns='trip_start_timestamp',
        train='pipelines/samples/tfx/taxi-cab-classification/train.csv',
        evaluation='pipelines/samples/tfx/taxi-cab-classification/eval.csv',
        mode='local',
        preprocess_module='pipelines/samples/tfx/taxi-cab-classification/preprocessing.py',
        learning_rate=0.1,
        hidden_layer_size=1500,
        steps=3000,
        analyze_slice_column='trip_start_hour'):
    tf_server_name = 'taxi-cab-classification-model-{{workflow.name}}'

    vop = dsl.VolumeOp(name='create-volume',
                       resource_name='taxi-cab-data',
                       modes=dsl.VOLUME_MODE_RWM,
                       size=pvc_size)

    checkout = dsl.ContainerOp(
        name="checkout",
        image="alpine/git:latest",
        command=[
            "git", "clone", "https://github.com/kubeflow/pipelines.git",
            "/mnt/pipelines"
        ],
    ).apply(onprem.mount_pvc(vop.outputs["name"], 'local-storage', "/mnt"))
    checkout.after(vop)

    validation = dataflow_tf_data_validation_op('/mnt/%s' % train,
                                                '/mnt/%s' % evaluation,
                                                '/mnt/%s' % column_names,
                                                key_columns, project, mode,
                                                '/mnt', vop.volume)
    validation.after(checkout)

    preprocess = dataflow_tf_transform_op('/mnt/%s' % train,
                                          '/mnt/%s' % evaluation,
                                          validation.outputs['schema'],
                                          project, mode,
                                          '/mnt/%s' % preprocess_module,
                                          '/mnt', vop.volume)

    training = tf_train_op(preprocess.output, validation.outputs['schema'],
                           learning_rate, hidden_layer_size, steps, 'tips',
                           '/mnt/%s' % preprocess_module, '/mnt', vop.volume)

    analysis = dataflow_tf_model_analyze_op(training.output,
                                            '/mnt/%s' % evaluation,
                                            validation.outputs['schema'],
                                            project, mode,
                                            analyze_slice_column, '/mnt',
                                            vop.volume)

    prediction = dataflow_tf_predict_op('/mnt/%s' % evaluation,
                                        validation.outputs['schema'], 'tips',
                                        training.output, mode, project,
                                        '/mnt', vop.volume)

    cm = confusion_matrix_op(prediction.output, '/mnt', vop.volume)

    roc = roc_op(prediction.output, '/mnt', vop.volume)

    deploy = kubeflow_deploy_op(training.output, tf_server_name, vop.output,
                                {'/mnt': vop.volume})
    )

    # This pipeline automatically injects the Kubeflow TFX image if the
    # environment variable 'KUBEFLOW_TFX_IMAGE' is defined. Currently, the tfx
    # cli tool exports the environment variable to pass to the pipelines.
    tfx_image = os.environ.get('KUBEFLOW_TFX_IMAGE', None)

    runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
        kubeflow_metadata_config=metadata_config,
        # Specify custom docker image to use.
        tfx_image=tfx_image,
        pipeline_operator_funcs=(
            # If running on K8s Engine (GKE) on Google Cloud Platform (GCP),
            # kubeflow_dag_runner.get_default_pipeline_operator_funcs()
            # provides default configurations specifically for GKE on GCP,
            # such as secrets.
            [
                onprem.mount_pvc(_persistent_volume_claim, _persistent_volume,
                                 _persistent_volume_mount)
            ]))

    kubeflow_dag_runner.KubeflowDagRunner(config=runner_config).run(
        _create_pipeline(
            pipeline_name=_pipeline_name,
            pipeline_root=_pipeline_root,
            data_root=_data_root,
            module_file=_module_file,
            serving_model_dir=_serving_model_dir,
            # 0 means auto-detect based on the number of CPUs available
            # during execution time.
            direct_num_workers=0))
def ai_training_run( # Define variables that the user can set in the pipelines UI; set default values ontap_cluster_mgmt_hostname: str = "10.61.188.40", ontap_cluster_admin_acct_k8s_secret: str = "ontap-cluster-mgmt-account", ontap_api_verify_ssl_cert: bool = True, dataset_volume_pvc_existing: str = "dataset-vol", dataset_volume_pv_existing: str = "pvc-43b12235-f32e-4dc4-a7b8-88e90d935a12", trained_model_volume_pvc_existing: str = "kfp-model-vol", trained_model_volume_pv_existing: str = "pvc-236e893b-63b4-40d3-963b-e709b9b2816b", execute_data_prep_step__yes_or_no: str = "yes", data_prep_step_container_image: str = "ubuntu:bionic", data_prep_step_command: str = "<insert command here>", data_prep_step_dataset_volume_mountpoint: str = "/mnt/dataset", train_step_container_image: str = "nvcr.io/nvidia/tensorflow:19.12-tf1-py3", train_step_command: str = "<insert command here>", train_step_dataset_volume_mountpoint: str = "/mnt/dataset", train_step_model_volume_mountpoint: str = "/mnt/model", validation_step_container_image: str = "nvcr.io/nvidia/tensorflow:19.12-tf1-py3", validation_step_command: str = "<insert command here>", validation_step_dataset_volume_mountpoint: str = "/mnt/dataset", validation_step_model_volume_mountpoint: str = "/mnt/model" ) : # Set GPU limits; Due to SDK limitations, this must be hardcoded train_step_num_gpu = 0 validation_step_num_gpu = 0 # Pipeline Steps: # Execute data prep step with dsl.Condition(execute_data_prep_step__yes_or_no == "yes") : data_prep = dsl.ContainerOp( name="data-prep", image=data_prep_step_container_image, command=["sh", "-c"], arguments=[data_prep_step_command] ) # Mount dataset volume/pvc data_prep.apply( onprem.mount_pvc(dataset_volume_pvc_existing, 'dataset', data_prep_step_dataset_volume_mountpoint) ) # Create a snapshot of the dataset volume/pvc for traceability dataset_snapshot = NetappSnapshotOp( ontap_cluster_mgmt_hostname, dataset_volume_pv_existing, ontap_api_verify_ssl_cert ) # Mount k8s secret containing ONTAP cluster admin account details dataset_snapshot.add_pvolumes({ '/mnt/secret': k8s_client.V1Volume( name='ontap-cluster-admin', secret=k8s_client.V1SecretVolumeSource( secret_name=ontap_cluster_admin_acct_k8s_secret ) ) }) # State that snapshot should be created after the data prep job completes dataset_snapshot.after(data_prep) # Execute training step train = dsl.ContainerOp( name="train-model", image=train_step_container_image, command=["sh", "-c"], arguments=[train_step_command] ) # Mount dataset volume/pvc train.apply( onprem.mount_pvc(dataset_volume_pvc_existing, 'datavol', train_step_dataset_volume_mountpoint) ) # Mount model volume/pvc train.apply( onprem.mount_pvc(trained_model_volume_pvc_existing, 'modelvol', train_step_model_volume_mountpoint) ) # Request that GPUs be allocated to training job pod if train_step_num_gpu > 0 : train.set_gpu_limit(train_step_num_gpu, 'nvidia') # State that training job should be executed after dataset volume snapshot is taken train.after(dataset_snapshot) # Create a snapshot of the model volume/pvc for model versioning model_snapshot = NetappSnapshotOp( ontap_cluster_mgmt_hostname, trained_model_volume_pv_existing, ontap_api_verify_ssl_cert ) # Mount k8s secret containing ONTAP cluster admin account details model_snapshot.add_pvolumes({ '/mnt/secret': k8s_client.V1Volume( name='ontap-cluster-admin', secret=k8s_client.V1SecretVolumeSource( secret_name=ontap_cluster_admin_acct_k8s_secret ) ) }) # State that snapshot should be created after the training job completes 
model_snapshot.after(train) # Execute inference validation job inference_validation = dsl.ContainerOp( name="validate-model", image=validation_step_container_image, command=["sh", "-c"], arguments=[validation_step_command] ) # Mount dataset volume/pvc inference_validation.apply( onprem.mount_pvc(dataset_volume_pvc_existing, 'datavol', validation_step_dataset_volume_mountpoint) ) # Mount model volume/pvc inference_validation.apply( onprem.mount_pvc(trained_model_volume_pvc_existing, 'modelvol', validation_step_model_volume_mountpoint) ) # Request that GPUs be allocated to pod if validation_step_num_gpu > 0 : inference_validation.set_gpu_limit(validation_step_num_gpu, 'nvidia') # State that inference validation job should be executed after model volume snapshot is taken inference_validation.after(model_snapshot)
def taxi_cab_classification( output, project, column_names='gs://ml-pipeline-playground/tfx/taxi-cab-classification/column-names.json', key_columns='trip_start_timestamp', train='gs://ml-pipeline-playground/tfx/taxi-cab-classification/train.csv', evaluation='gs://ml-pipeline-playground/tfx/taxi-cab-classification/eval.csv', mode='local', preprocess_module='gs://ml-pipeline-playground/tfx/taxi-cab-classification/preprocessing.py', learning_rate=0.1, hidden_layer_size='1500', steps=3000, analyze_slice_column='trip_start_hour'): output_template = str(output) + '/{{workflow.uid}}/{{pod.name}}/data' target_lambda = """lambda x: (x['target'] > x['fare'] * 0.2)""" target_class_lambda = """lambda x: 1 if (x['target'] > x['fare'] * 0.2) else 0""" tf_server_name = 'taxi-cab-classification-model-{{workflow.uid}}' if platform != 'GCP': vop = dsl.VolumeOp(name="create_pvc", resource_name="pipeline-pvc", modes=dsl.VOLUME_MODE_RWM, size="1Gi") checkout = dsl.ContainerOp( name="checkout", image="alpine/git:latest", command=[ "git", "clone", "https://github.com/kubeflow/pipelines.git", str(output) + "/pipelines" ], ).apply(onprem.mount_pvc(vop.outputs["name"], 'local-storage', output)) checkout.after(vop) validation = dataflow_tf_data_validation_op( inference_data=train, validation_data=evaluation, column_names=column_names, key_columns=key_columns, gcp_project=project, run_mode=mode, validation_output=output_template, ) if platform != 'GCP': validation.after(checkout) preprocess = dataflow_tf_transform_op( training_data_file_pattern=train, evaluation_data_file_pattern=evaluation, schema=validation.outputs['schema'], gcp_project=project, run_mode=mode, preprocessing_module=preprocess_module, transformed_data_dir=output_template) training = tf_train_op(transformed_data_dir=preprocess.output, schema=validation.outputs['schema'], learning_rate=learning_rate, hidden_layer_size=hidden_layer_size, steps=steps, target='tips', preprocessing_module=preprocess_module, training_output_dir=output_template) analysis = dataflow_tf_model_analyze_op( model=training.output, evaluation_data=evaluation, schema=validation.outputs['schema'], gcp_project=project, run_mode=mode, slice_columns=analyze_slice_column, analysis_results_dir=output_template) prediction = dataflow_tf_predict_op(data_file_pattern=evaluation, schema=validation.outputs['schema'], target_column='tips', model=training.output, run_mode=mode, gcp_project=project, predictions_dir=output_template) cm = confusion_matrix_op(predictions=prediction.output, target_lambda=target_lambda, output_dir=output_template) roc = roc_op(predictions_dir=prediction.output, target_lambda=target_class_lambda, output_dir=output_template) if platform == 'GCP': deploy = kubeflow_deploy_op(model_dir=str(training.output) + '/export/export', server_name=tf_server_name) else: deploy = kubeflow_deploy_op(cluster_name=project, model_dir=str(training.output) + '/export/export', pvc_name=vop.outputs["name"], server_name=tf_server_name) steps = [ validation, preprocess, training, analysis, prediction, cm, roc, deploy ] for step in steps: if platform == 'GCP': step.apply(gcp.use_gcp_secret('user-gcp-sa')) else: step.apply( onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
def face_recognition( train_steps='30', learning_rate='-1', batch_size='1000', dataset_dir='/dataset', output_dir='/output', public_ip='10.1.0.15', ): """ Pipeline with three stages: 1. prepare the face recognition align dataset CASIA-WebFace 2. train an facenet classifier model 3. deploy a tf-serving instance to the cluster 4. deploy a web-ui to interact with it """ if platform == 'onprem': data_vop = dsl.VolumeOp(name="prepare_data_vop", storage_class="rook-ceph-fs", resource_name="data-pvc", modes=dsl.VOLUME_MODE_RWM, size="10Gi") data_pvc_name = data_vop.outputs["name"] output_vop = dsl.VolumeOp(name="prepare_output_vop", storage_class="csi-s3", resource_name="output-pvc", modes=dsl.VOLUME_MODE_RWM, size="1Gi") output_vop.after(data_vop) output_pvc_name = output_vop.outputs["name"] casia_align_data = str( dataset_dir) + "/data/casia_maxpy_mtcnnalign_182_160/" if is_aligned == 'True': raw_dataset = dsl.ContainerOp( name="raw_dataset", image="aiven86/facenet-dataset-casia-maxpy-clean:tail-2000", command=[ "/bin/sh", "-c", "echo 'begin moving data';mv /data/ %s/;echo 'moving is finished';" % str(dataset_dir) ], ).apply(onprem.mount_pvc(data_pvc_name, 'dataset-storage', dataset_dir)) raw_dataset.after(output_vop) casia_align_data = str( dataset_dir) + "/data/casia_maxpy_tail_2000_mtcnnalign_182_160" align_dataset_lfw = dsl.ContainerOp( name="align_dataset_lfw", image="aiven86/facenet-tensorflow:1.13.1-gpu-py3", command=[ "/bin/sh", "-c", "python /facenet/src/align/align_dataset_mtcnn.py %s/data/lfw " "%s/data/lfw_mtcnnalign_160 --image_size 160 --margin 32 --random_order --gpu_memory_fraction 0.8" % (str(dataset_dir), str(dataset_dir)) ], ).apply(onprem.mount_pvc(data_pvc_name, 'dataset-storage', dataset_dir)) align_dataset_lfw.container.add_resource_limit("nvidia.com/gpu", 1) align_dataset_lfw.container.add_env_variable( V1EnvVar(name='PYTHONPATH', value=PYTHONPATH)) align_dataset_lfw.after(raw_dataset) align_dataset = dsl.ContainerOp( name="align_dataset", image="aiven86/facenet-tensorflow:1.13.1-gpu-py3", command=[ "/bin/sh", "-c", "python /facenet/src/align/align_dataset_mtcnn.py %s/data/CASIA-maxpy-clean-tail-2000 " "%s --image_size 182 --margin 44 --random_order --gpu_memory_fraction 0.8" % (str(dataset_dir), str(casia_align_data)) ], ).add_resource_limit("nvidia.com/gpu", 1) align_dataset.after(align_dataset_lfw) else: align_dataset = dsl.ContainerOp( name="align_dataset", image="aiven86/facenet-dataset-casia-mtcnnalign:test", command=[ "/bin/sh", "-c", "echo 'begin moving data';mv /data/ %s/;echo 'moving is finished';" % str(dataset_dir) ], ) align_dataset.after(output_vop) train = dsl.ContainerOp( name='train', image='aiven86/facenet-tensorflow:1.13.1-gpu-py3', command=[ "/bin/sh", "-c", "cd /facenet; python src/train_softmax.py --logs_base_dir %s/logs/facenet/ --models_base_dir %s/models/facenet/ " "--data_dir %s --image_size 160 --model_def models.inception_resnet_v1 " "--lfw_dir %s/data/lfw_mtcnnalign_160/ --optimizer ADAM --learning_rate %s --max_nrof_epochs %s --keep_probability 0.8 " "--random_crop --random_flip --use_fixed_image_standardization " "--learning_rate_schedule_file data/learning_rate_schedule_classifier_casia.txt --weight_decay 5e-4 " "--embedding_size 512 --lfw_distance_metric 1 --lfw_use_flipped_images --lfw_subtract_mean " "--validation_set_split_ratio 0.05 --validate_every_n_epochs 5 --prelogits_norm_loss_factor 5e-4 " "--epoch_size %s --gpu_memory_fraction 0.8; cp -r %s/logs %s/logs" % (str(dataset_dir), str(dataset_dir), str(casia_align_data), 
str(dataset_dir), learning_rate, train_steps, batch_size, str(dataset_dir), str(output_dir)) ]).add_resource_limit("nvidia.com/gpu", 1) #.add_resource_limit("aliyun.com/gpu-mem", 2) train.after(align_dataset) transform_model = dsl.ContainerOp( name='transform_model', #file_outputs={'output': '/output.txt'}, image='aiven86/facenet-tensorflow:1.13.1-gpu-py3', command=[ "/bin/sh", "-c", "MODEL_DIR=`ls %s/models/facenet/`;cd /facenet;" "python src/freeze_graph.py %s/models/facenet/$MODEL_DIR %s/models/facenet/$MODEL_DIR/$MODEL_DIR.pb;" "cp -r %s/models %s/models;echo $MODEL_DIR > /output.txt;cat /output.txt" % (str(dataset_dir), str(dataset_dir), str(dataset_dir), str(dataset_dir), str(output_dir)) ]).add_resource_limit("nvidia.com/gpu", 1) transform_model.after(train) ran_str = ''.join(random.sample('zyxwvutsrqponmlkjihgfedcba0123456789', 5)) tf_serv_service_name = "face-recognition-service-" + ran_str cluster_name = "face-recognition-pipeline-" + ran_str serve_args = [ '--model-export-path', "/mnt/models/facenet/", '--server-name', tf_serv_service_name ] if platform == 'onprem': serve_args.extend( ['--cluster-name', cluster_name, '--pvc-name', output_pvc_name]) serve = dsl.ContainerOp( name='serve', image='aiven86/ml-pipeline_ml-pipeline-kubeflow-deployer:' '7775692adf28d6f79098e76e839986c9ee55dd61', arguments=serve_args) serve.after(transform_model) model_name = str(transform_model.output) tensorboard_args = [ '--image', 'tensorflow/tensorflow:1.13.1', '--name', 'face-tensorboard-' + ran_str, '--container-port', '6006', '--service-port', '9000', '--service-type', "NodePort", '--pvc-name', output_pvc_name, '--cmd', '["/usr/local/bin/tensorboard","--logdir=/mnt/logs/facenet","--port=6006"]', '--public-ip', public_ip, ] if platform == 'onprem': tensorboard_args.extend(['--cluster-name', cluster_name]) tensorboard = dsl.ContainerOp( name='tensorboard', image='aiven86/kubeflow-examples_face_deploy-service:tensorboard', arguments=tensorboard_args).set_image_pull_policy('IfNotPresent') tensorboard.after(serve) webui_args = [ '--image', 'aiven86/tf-face-recognition:1.0', '--name', 'face-web-ui-' + ran_str, '--container-port', '5000', '--service-port', '5000', '--service-type', "NodePort", '--pvc-name', output_pvc_name, '--model-file-name', '/mnt/models/facenet/%s/%s.pb' % (model_name, model_name), '--tf-serving-host', tf_serv_service_name, '--public-ip', public_ip, ] if platform == 'onprem': webui_args.extend(['--cluster-name', cluster_name]) web_ui = dsl.ContainerOp( name='web_ui', image='aiven86/kubeflow-examples_face_deploy-service:web-ui', arguments=webui_args).set_image_pull_policy('IfNotPresent') web_ui.after(serve) steps = [align_dataset, train, transform_model, serve, tensorboard, web_ui] for step in steps: step.apply( onprem.mount_pvc(data_pvc_name, 'dataset-storage', dataset_dir)) step.apply( onprem.mount_pvc(output_pvc_name, 'output-storage', output_dir)) if step in [align_dataset, train, transform_model]: step.container.add_env_variable( V1EnvVar(name='PYTHONPATH', value=PYTHONPATH))