def my_pipeline(
        minio_endpoint='minio-service:9000',
        log_bucket='mlpipeline',
        log_dir=f'tensorboard/logs/{dsl.RUN_ID_PLACEHOLDER}',
        # Pin to tensorflow 2.3, because in 2.4+ tensorboard cannot load in KFP:
        # refer to https://github.com/kubeflow/pipelines/issues/5521.
        tf_image='gcr.io/deeplearning-platform-release/tf2-cpu.2-3:latest'):
    # Tensorboard uses the s3 protocol to access minio.
    prepare_tb_task = prepare_tensorboard(
        log_dir_uri=f's3://{log_bucket}/{log_dir}',
        image=tf_image,
        pod_template_spec=json.dumps({
            'spec': {
                'containers': [{
                    # These env vars let tensorboard access the KFP in-cluster
                    # minio using the s3 protocol.
                    # Reference: https://blog.min.io/hyper-scale-machine-learning-with-minio-and-tensorflow/
                    'env': [{
                        'name': 'AWS_ACCESS_KEY_ID',
                        'valueFrom': {
                            'secretKeyRef': {
                                'name': 'mlpipeline-minio-artifact',
                                'key': 'accesskey'
                            }
                        }
                    }, {
                        'name': 'AWS_SECRET_ACCESS_KEY',
                        'valueFrom': {
                            'secretKeyRef': {
                                'name': 'mlpipeline-minio-artifact',
                                'key': 'secretkey'
                            }
                        }
                    }, {
                        'name': 'AWS_REGION',
                        'value': 'minio'
                    }, {
                        'name': 'S3_ENDPOINT',
                        'value': f'{minio_endpoint}',
                    }, {
                        'name': 'S3_USE_HTTPS',
                        'value': '0',
                    }, {
                        'name': 'S3_VERIFY_SSL',
                        'value': '0',
                    }]
                }],
            },
        }))

    train_task = train_op(
        minio_endpoint=minio_endpoint,
        log_bucket=log_bucket,
        log_dir=log_dir,
    )
    train_task.apply(
        use_k8s_secret(
            secret_name='mlpipeline-minio-artifact',
            k8s_secret_key_to_env={
                'secretkey': 'MINIO_SECRET_KEY',
                'accesskey': 'MINIO_ACCESS_KEY'
            },
        ))
    # Optional: let the training task use the same tensorflow image as the
    # tensorboard instance specified above.
    train_task.container.image = tf_image
    train_task.after(prepare_tb_task)
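# --- Usage sketch (not part of the original sample): compile the pipeline
# with the KFP v1 SDK, or submit it directly. Assumes `import json`,
# `import kfp`, `from kfp import dsl`, `from kfp.onprem import use_k8s_secret`,
# that the component factories (`prepare_tensorboard`, `train_op`) are loaded
# elsewhere, and that the function above is decorated with `@dsl.pipeline` in
# the full source. ---
if __name__ == '__main__':
    import kfp

    # Compile to an Argo package that can be uploaded through the KFP UI...
    kfp.compiler.Compiler().compile(my_pipeline, 'tensorboard_minio.yaml')

    # ...or create a run directly against a reachable KFP endpoint.
    kfp.Client().create_run_from_pipeline_func(my_pipeline, arguments={})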
def pytorch_bert(  # pylint: disable=too-many-arguments
        minio_endpoint=MINIO_ENDPOINT,
        log_bucket=LOG_BUCKET,
        log_dir=f"tensorboard/logs/{dsl.RUN_ID_PLACEHOLDER}",
        mar_path=f"mar/{dsl.RUN_ID_PLACEHOLDER}/model-store",
        config_prop_path=f"mar/{dsl.RUN_ID_PLACEHOLDER}/config",
        model_uri=f"s3://mlpipeline/mar/{dsl.RUN_ID_PLACEHOLDER}",
        tf_image=TENSORBOARD_IMAGE,
        deploy=DEPLOY_NAME,
        namespace=NAMESPACE,
        confusion_matrix_log_dir=f"confusion_matrix/{dsl.RUN_ID_PLACEHOLDER}/",
        num_samples=1000,
        max_epochs=1):
    """This method defines the pipeline tasks and operations."""
    prepare_tb_task = prepare_tensorboard_op(
        log_dir_uri=f"s3://{log_bucket}/{log_dir}",
        image=tf_image,
        pod_template_spec=json.dumps({
            "spec": {
                "containers": [{
                    "env": [
                        {
                            "name": "AWS_ACCESS_KEY_ID",
                            "valueFrom": {
                                "secretKeyRef": {
                                    "name": "mlpipeline-minio-artifact",
                                    "key": "accesskey",
                                }
                            },
                        },
                        {
                            "name": "AWS_SECRET_ACCESS_KEY",
                            "valueFrom": {
                                "secretKeyRef": {
                                    "name": "mlpipeline-minio-artifact",
                                    "key": "secretkey",
                                }
                            },
                        },
                        {
                            "name": "AWS_REGION",
                            "value": "minio"
                        },
                        {
                            "name": "S3_ENDPOINT",
                            "value": f"{minio_endpoint}",
                        },
                        {
                            "name": "S3_USE_HTTPS",
                            "value": "0"
                        },
                        {
                            "name": "S3_VERIFY_SSL",
                            "value": "0"
                        },
                    ]
                }]
            }
        }),
    ).set_display_name("Visualization")

    prep_task = (prep_op().after(prepare_tb_task).set_display_name(
        "Preprocess & Transform"))

    confusion_matrix_url = f"minio://{log_bucket}/{confusion_matrix_log_dir}"
    script_args = f"model_name=bert.pth," \
                  f"num_samples={num_samples}," \
                  f"confusion_matrix_url={confusion_matrix_url}"
    # For GPUs, set the number of GPUs and the accelerator type.
    ptl_args = f"max_epochs={max_epochs}," \
               "profiler=pytorch," \
               "gpus=0," \
               "accelerator=None"

    train_task = (train_op(
        input_data=prep_task.outputs["output_data"],
        script_args=script_args,
        ptl_arguments=ptl_args).after(prep_task).set_display_name("Training"))
    # For GPU training, uncomment the chained calls below and set the GPU
    # limit and node selector:
    # ).set_gpu_limit(1).add_node_selector_constraint(
    #     'cloud.google.com/gke-accelerator', 'nvidia-tesla-p4')

    (minio_op(
        bucket_name="mlpipeline",
        folder_name=log_dir,
        input_path=train_task.outputs["tensorboard_root"],
        filename="",
    ).after(train_task).set_display_name("Tensorboard Events Pusher"))

    minio_mar_upload = (minio_op(
        bucket_name="mlpipeline",
        folder_name=mar_path,
        input_path=train_task.outputs["checkpoint_dir"],
        filename="bert_test.mar",
    ).after(train_task).set_display_name("Mar Pusher"))

    (minio_op(
        bucket_name="mlpipeline",
        folder_name=config_prop_path,
        input_path=train_task.outputs["checkpoint_dir"],
        filename="config.properties",
    ).after(train_task).set_display_name("Config Pusher"))

    model_uri = str(model_uri)
    # pylint: disable=unused-variable
    isvc_yaml = """
    apiVersion: "serving.kubeflow.org/v1beta1"
    kind: "InferenceService"
    metadata:
      name: {}
      namespace: {}
    spec:
      predictor:
        serviceAccountName: sa
        pytorch:
          storageUri: {}
          resources:
            limits:
              memory: 4Gi
    """.format(deploy, namespace, model_uri)

    # For GPU inference, use the yaml below with a GPU count and accelerator.
    gpu_count = "1"
    accelerator = "nvidia-tesla-p4"
    isvc_gpu_yaml = """
    apiVersion: "serving.kubeflow.org/v1beta1"
    kind: "InferenceService"
    metadata:
      name: {}
      namespace: {}
    spec:
      predictor:
        serviceAccountName: sa
        pytorch:
          storageUri: {}
          resources:
            limits:
              memory: 4Gi
              nvidia.com/gpu: {}
        nodeSelector:
          cloud.google.com/gke-accelerator: {}
    """.format(deploy, namespace, model_uri, gpu_count, accelerator)

    # For GPU inference, pass isvc_gpu_yaml below instead of isvc_yaml.
    deploy_task = (deploy_op(
        action="apply",
        inferenceservice_yaml=isvc_yaml).after(
            minio_mar_upload).set_display_name("Deployer"))
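# --- Usage sketch (an assumption, not from the original source): submit the
# pipeline with overridden arguments via the KFP v1 SDK. `kfp.Client()` assumes
# a reachable Kubeflow Pipelines endpoint, and the argument values here are
# illustrative only. ---
if __name__ == "__main__":
    import kfp

    kfp.Client().create_run_from_pipeline_func(
        pytorch_bert,
        arguments={"num_samples": 2000, "max_epochs": 2},
    )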
    dsl.get_pipeline_conf().add_op_transformer(
        use_k8s_secret(
            secret_name="mlpipeline-minio-artifact",
            k8s_secret_key_to_env={
                "secretkey": "MINIO_SECRET_KEY",
                "accesskey": "MINIO_ACCESS_KEY",
            },
        ))
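    # Note (sketch, not part of the original source): `add_op_transformer`
    # must run inside the `@dsl.pipeline`-decorated function body. It applies
    # `use_k8s_secret` to every op created in the pipeline, injecting
    # MINIO_ACCESS_KEY and MINIO_SECRET_KEY into each container, and is the
    # pipeline-wide equivalent of the per-task `.apply(use_k8s_secret(...))`
    # calls in the variant below.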
def pytorch_bert(
        minio_endpoint="http://minio-service.kubeflow:9000",
        log_bucket="mlpipeline",
        log_dir=f"tensorboard/logs/{dsl.RUN_ID_PLACEHOLDER}",
        mar_path=f"mar/{dsl.RUN_ID_PLACEHOLDER}/model-store",
        config_prop_path=f"mar/{dsl.RUN_ID_PLACEHOLDER}/config",
        model_uri=f"s3://mlpipeline/mar/{dsl.RUN_ID_PLACEHOLDER}",
        tf_image="jagadeeshj/tb_plugin:v1.8",
        deploy="bertserve",
        namespace="kubeflow-user-example-com",
        confusion_matrix_log_dir=f"confusion_matrix/{dsl.RUN_ID_PLACEHOLDER}/",
        num_samples=1000):
    prepare_tb_task = prepare_tensorboard_op(
        log_dir_uri=f"s3://{log_bucket}/{log_dir}",
        image=tf_image,
        pod_template_spec=json.dumps({
            "spec": {
                "containers": [{
                    "env": [
                        {
                            "name": "AWS_ACCESS_KEY_ID",
                            "valueFrom": {
                                "secretKeyRef": {
                                    "name": "mlpipeline-minio-artifact",
                                    "key": "accesskey",
                                }
                            },
                        },
                        {
                            "name": "AWS_SECRET_ACCESS_KEY",
                            "valueFrom": {
                                "secretKeyRef": {
                                    "name": "mlpipeline-minio-artifact",
                                    "key": "secretkey",
                                }
                            },
                        },
                        {
                            "name": "AWS_REGION",
                            "value": "minio"
                        },
                        {
                            "name": "S3_ENDPOINT",
                            "value": f"{minio_endpoint}"
                        },
                        {
                            "name": "S3_USE_HTTPS",
                            "value": "0"
                        },
                        {
                            "name": "S3_VERIFY_SSL",
                            "value": "0"
                        },
                    ]
                }]
            }
        }),
    ).set_display_name("Visualization")

    prep_task = prep_op().after(prepare_tb_task).set_display_name(
        "Preprocess & Transform")

    train_task = (train_op(
        input_data=prep_task.outputs["output_data"],
        profiler="pytorch",
        confusion_matrix_url=f"minio://{log_bucket}/{confusion_matrix_log_dir}",
        num_samples=num_samples).apply(
            use_k8s_secret(
                secret_name="mlpipeline-minio-artifact",
                k8s_secret_key_to_env={
                    "secretkey": "MINIO_SECRET_KEY",
                    "accesskey": "MINIO_ACCESS_KEY",
                },
            )).after(prep_task).set_display_name("Training"))

    minio_tb_upload = (minio_op(
        bucket_name="mlpipeline",
        folder_name=log_dir,
        input_path=train_task.outputs["tensorboard_root"],
        filename="",
    ).apply(
        use_k8s_secret(
            secret_name="mlpipeline-minio-artifact",
            k8s_secret_key_to_env={
                "secretkey": "MINIO_SECRET_KEY",
                "accesskey": "MINIO_ACCESS_KEY",
            },
        )).after(train_task).set_display_name("Tensorboard Events Pusher"))

    minio_mar_upload = (minio_op(
        bucket_name="mlpipeline",
        folder_name=mar_path,
        input_path=train_task.outputs["checkpoint_dir"],
        filename="bert_test.mar",
    ).apply(
        use_k8s_secret(
            secret_name="mlpipeline-minio-artifact",
            k8s_secret_key_to_env={
                "secretkey": "MINIO_SECRET_KEY",
                "accesskey": "MINIO_ACCESS_KEY",
            },
        )).after(train_task).set_display_name("Mar Pusher"))

    minio_config_upload = (minio_op(
        bucket_name="mlpipeline",
        folder_name=config_prop_path,
        input_path=train_task.outputs["checkpoint_dir"],
        filename="config.properties",
    ).apply(
        use_k8s_secret(
            secret_name="mlpipeline-minio-artifact",
            k8s_secret_key_to_env={
                "secretkey": "MINIO_SECRET_KEY",
                "accesskey": "MINIO_ACCESS_KEY",
            },
        )).after(train_task).set_display_name("Config Pusher"))

    model_uri = str(model_uri)
    isvc_yaml = """
    apiVersion: "serving.kubeflow.org/v1beta1"
    kind: "InferenceService"
    metadata:
      name: {}
      namespace: {}
    spec:
      predictor:
        serviceAccountName: sa
        pytorch:
          storageUri: {}
          resources:
            limits:
              memory: 4Gi
    """.format(deploy, namespace, model_uri)

    deploy_task = (deploy_op(
        action="apply",
        inferenceservice_yaml=isvc_yaml).after(
            minio_mar_upload).set_display_name("Deployer"))
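# --- Usage sketch (assumption, not from the original source): compile the
# pipeline to a package with the KFP v1 SDK; in a multi-user deployment, a run
# can also be submitted into the profile namespace targeted by the
# InferenceService above. ---
if __name__ == "__main__":
    import kfp

    kfp.compiler.Compiler().compile(pytorch_bert, "pytorch_bert.yaml")
    # Or submit directly into the user namespace:
    # kfp.Client().create_run_from_pipeline_func(
    #     pytorch_bert, arguments={}, namespace="kubeflow-user-example-com")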