def santander_transaction_classification(
    output,
    project,
):
    tf_server_name = 'kfdemo-service'

    if platform != 'GCP':
        vop = dsl.VolumeOp(name="create_pvc",
                           resource_name="pipeline-pvc",
                           modes=dsl.VOLUME_MODE_RWM,
                           size="1Gi")

        checkout = dsl.ContainerOp(
            name="checkout",
            image="alpine/git:latest",
            command=[
                "git", "clone", "https://github.com/kubeflow/pipelines.git",
                str(output) + "/pipelines"
            ],
        ).apply(onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
        checkout.after(vop)

    if platform == 'GCP':
        deploy = kubeflow_deploy_op(model_dir=str(
            'gs://kubeflow-pipelines-demo/tfx/0b22081a-ed94-11e9-81fb-42010a800160/santander-customer-transaction-prediction-95qxr-268134926/data'
        ) + '/export/export',
                                    server_name=tf_server_name)
    else:
        deploy = kubeflow_deploy_op(
            cluster_name=project,
            model_dir=str(
                'gs://kubeflow-pipelines-demo/tfx/0b22081a-ed94-11e9-81fb-42010a800160/santander-customer-transaction-prediction-95qxr-268134926/data'
            ) + '/export/export',
            pvc_name=vop.outputs["name"],
            server_name=tf_server_name)

    webapp = dsl.ContainerOp(
        name='webapp',
        image='us.gcr.io/kf-pipelines/ml-pipeline-webapp-launcher:v0.3',
        arguments=["--model_name", 'santanderapp'])
    webapp.after(deploy)

    steps = [deploy, webapp]

    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(
                onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
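
The snippets in this listing define pipeline functions but generally omit the module-level setup they rely on (the kfp imports, the platform flag, a pvc_name) and the compilation step. A minimal sketch of that surrounding boilerplate, assuming the v1 kfp SDK; the pipeline name, image, and PVC name below are illustrative and not taken from the original samples:

# Illustrative boilerplate around the sampled pipeline functions (kfp v1 SDK).
import kfp
from kfp import dsl, gcp, onprem

platform = 'onprem'        # the snippets branch on this module-level flag
pvc_name = 'my-data-pvc'   # illustrative PVC name used by the on-prem branch

@dsl.pipeline(name='echo-example', description='minimal mount_pvc example')
def echo_pipeline():
    echo = dsl.ContainerOp(name='echo',
                           image='alpine:3.12',
                           command=['sh', '-c', 'ls /mnt'])
    if platform == 'GCP':
        echo.apply(gcp.use_gcp_secret('user-gcp-sa'))
    else:
        echo.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))

if __name__ == '__main__':
    kfp.compiler.Compiler().compile(echo_pipeline, 'echo_pipeline.yaml')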
def notebook_pipeline():
    """A pipeline to run a Jupyter notebook with elyra-ai/kfp-notebook and Papermill."""

    from kfp_notebook.pipeline import NotebookOp

    notebook_op = NotebookOp(name="${name}",
                             notebook="${notebook}",
                             cos_endpoint="${cos_endpoint}",
                             cos_bucket="${cos_bucket}",
                             cos_directory="${cos_directory}",
                             cos_dependencies_archive="${cos_dependencies_archive}",
                             requirements_url="${requirements_url}",
                             image="${image}")

    from kubernetes.client.models import V1EnvVar

    notebook_op.container.add_env_variable(V1EnvVar(name='AWS_ACCESS_KEY_ID', value="${cos_username}"))
    notebook_op.container.add_env_variable(V1EnvVar(name='AWS_SECRET_ACCESS_KEY', value="${cos_password}"))

    from kfp import onprem

    notebook_op.container.add_env_variable(V1EnvVar(name='DATA_DIR', value="${mount_path}"))
    notebook_op.apply(onprem.mount_pvc(pvc_name='${dataset_pvc}',
                                       volume_name='${dataset_pvc}',
                                       volume_mount_path='${mount_path}'))
Example #3
def fuel_pipeline(bucket_name='gs://your-bucket/export',
                  input_file='folder/file',
                  output_folder='output folder',
                  epochs=10):
    preprocess = dsl.ContainerOp(name='preprocess',
                                 image='gcr.io/kb-poc-262417/fuel:latest',
                                 arguments=[
                                     '--input_file', input_file,
                                     '--output_folder', output_folder,
                                     '--bucket_name', bucket_name
                                 ])

    train = dsl.ContainerOp(
        name='train',
        image='gcr.io/kb-poc-262417/fuel/train:latest',
        arguments=['--bucket_name', bucket_name, '--epochs', epochs])
    train.after(preprocess)

    serve = dsl.ContainerOp(name='serve',
                            image='gcr.io/kb-poc-262417/fuel/serve:latest',
                            arguments=['--bucket_name', bucket_name])

    serve.after(train)

    steps = [preprocess, train, serve]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))
Example #4
def pipeline(gs_bucket='gs://your-bucket/export',
             input_file_with_folder='input/churn.csv',
             output_folder='output',
             optimizer_name='RMSProp',
             learning_rate=0.003,
             momentum=0.01,
             model_dir='gs://your-bucket/export',
             model_name='dummy',
             server_name='dummy'):

  preprocess_args = [
      '--bucket_name', gs_bucket,
      '--input_file_with_folder', input_file_with_folder,
      '--output', output_folder
  ]
  preprocess = dsl.ContainerOp(
      name='preprocess',
      image='gcr.io/kube-2020/churn/preprocess:latest',
      arguments= preprocess_args
  )


  train_args = [
      '--bucket_name', gs_bucket,
      '--output_folder', output_folder,
      '--optimizer_name', optimizer_name,
      '--learning_rate', learning_rate,
      '--momentum', momentum
  ]
  train = dsl.ContainerOp(
      name='train',
      image='gcr.io/kube-2020/churn/train:latest',
      arguments= train_args
  )

  serve_args = [
      '--model_path', model_dir,
      '--model_name', model_name,
      '--server_name', server_name
  ]

  serve = dsl.ContainerOp(
      name='serve',
      image='gcr.io/kube-2020/churn/pipeline/deployer:latest',
      arguments=serve_args
  )

  steps = [preprocess, train, serve]
  for step in steps:
    if platform == 'GCP':
      step.apply(gcp.use_gcp_secret('user-gcp-sa'))
    else:
      step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))

  train.after(preprocess)
  serve.after(train)
def mnist_pipeline(model_export_dir='gs://your-bucket/export',
                   train_steps='200',
                   learning_rate='0.01',
                   batch_size='100',
                   pvc_name=''):
    """
    Pipeline with three stages:
      1. train an MNIST classifier
      2. deploy a tf-serving instance to the cluster
      3. deploy a web-ui to interact with it
    """
    train = dsl.ContainerOp(
        name='train',
        image=
        'gcr.io/kubeflow-examples/mnist/model:v20190304-v0.2-176-g15d997b',
        arguments=[
            "/opt/model.py", "--tf-export-dir", model_export_dir,
            "--tf-train-steps", train_steps, "--tf-batch-size", batch_size,
            "--tf-learning-rate", learning_rate
        ])

    serve_args = [
        '--model-export-path', model_export_dir, '--server-name',
        "mnist-service"
    ]
    if platform != 'GCP':
        serve_args.extend(
            ['--cluster-name', "mnist-pipeline", '--pvc-name', pvc_name])

    serve = dsl.ContainerOp(
        name='serve',
        image='gcr.io/ml-pipeline/ml-pipeline-kubeflow-deployer:'
        'e9b96de317989a9673ef88d88fb9dab9dac3005f',
        arguments=serve_args)
    serve.after(train)

    web_ui = dsl.ContainerOp(
        name='web-ui',
        image='brightfly/kubeflow-deploy-service:handson',
        arguments=[
            '--image',
            'gcr.io/kubeflow-examples/mnist/web-ui:v20190304-v0.2-176-g15d997b-pipelines',
            '--name', 'web-ui', '--container-port', '5000', '--service-port',
            '80', '--service-type', "LoadBalancer", '--cluster-name',
            "mnist-pipeline"
        ])
    web_ui.after(serve)

    steps = [train, serve, web_ui]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))
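
Instead of compiling to a package, a pipeline function like the one above can also be submitted directly with the v1 kfp client. A hedged sketch; the host URL and argument values are illustrative:

# Submit the pipeline function above straight to a KFP endpoint (kfp v1 SDK).
import kfp

client = kfp.Client(host='http://ml-pipeline.kubeflow.svc.cluster.local:8888')
client.create_run_from_pipeline_func(
    mnist_pipeline,
    arguments={'train_steps': '500', 'pvc_name': 'mnist-data-pvc'})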
Example #6
def train_pipeline(output="/mnt/model.h5",
                   result="/mnt/results.txt",
                   pvc_name="train-vol",
                   pvc_path="/mnt",
                   epochs=30,
                   validations=10,
                   trainset='/cut',
                   testset='/cut',
                   input='/train.csv',
                   filenames='id',
                   target='has_scratch',
                   train_size=0.8,
                   learn_rate=0.0001,
                   workers=2):
    train = train_op(epochs, validations, workers, pvc_path, trainset, input,
                     filenames, target, train_size, learn_rate, output).apply(
                         onprem.mount_pvc("train-vol", 'local-storage',
                                          "/mnt"))
    load = load_op(workers, pvc_path, testset, input, filenames, target,
                   train.outputs['output'], result).apply(
                       onprem.mount_pvc("train-vol", 'local-storage', "/mnt"))
def mnist_pipeline(model_export_dir='gs://kf-test1234/export',
                   project='<your project id>',
                   bucket_name='kf-test1234',
                   n_class='10',
                   pvc_name=''):
    test = _test(project, bucket_name, n_class, model_export_dir)

    steps = [test]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))
Example #8
def email_pipeline(
    server_secret="server-secret",
    subject="Hi, again!",
    body="Tekton email",
    sender="*****@*****.**",
    recipients="[email protected], [email protected]",
    attachment_filepath="/tmp/data/output.txt"
):
    # Directory holding the attachment (assumed from the default filepath);
    # the shared PVC is mounted here so both steps see the same file.
    attachment_path = "/tmp/data"
    email = email_op(server_secret=server_secret,
                     subject=subject,
                     body=body,
                     sender=sender,
                     recipients=recipients,
                     attachment_path=attachment_filepath)
    email.add_env_variable(env_from_secret('USER', '$(params.server_secret)', 'user'))
    email.add_env_variable(env_from_secret('PASSWORD', '$(params.server_secret)', 'password'))
    email.add_env_variable(env_from_secret('TLS', '$(params.server_secret)', 'tls'))
    email.add_env_variable(env_from_secret('SERVER', '$(params.server_secret)', 'url'))
    email.add_env_variable(env_from_secret('PORT', '$(params.server_secret)', 'port'))
    email.apply(onprem.mount_pvc('shared-pvc', 'shared-pvc', attachment_path))

    with dsl.ExitHandler(email):
        write_file_task = write_file(attachment_filepath).apply(onprem.mount_pvc('shared-pvc', 'shared-pvc', attachment_path))
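
The email example calls an env_from_secret helper that is not shown. A minimal sketch, assuming it wraps a Kubernetes secretKeyRef so each variable is read from the server secret at runtime (the real sample may differ):

# Hedged sketch of the env_from_secret helper used above.
from kubernetes.client.models import V1EnvVar, V1EnvVarSource, V1SecretKeySelector

def env_from_secret(env_name, secret_name, secret_key):
    return V1EnvVar(
        name=env_name,
        value_from=V1EnvVarSource(
            secret_key_ref=V1SecretKeySelector(name=secret_name, key=secret_key)))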
Example #9
def pipeline(dataset_location='/mnt/data/manipulated_fashion_mnist.csv',
             test_size=0.3,
             random_state=42,
             input_shape_height=28,
             input_shape_width=28,
             use_pretrained_model='False',
             model_units_num=128,
             model_outputs_num=10,
             model_activation_func_layer2='relu',
             model_activation_func_layer3='softmax',
             optimizer='adam',
             loss='binary_crossentropy',
             metrics='accuracy',
             num_epochs=10,
             location_prepared_dataset='/mnt/data/prep_fashion_mnist.csv',
             location_improved_dataset='/mnt/data/impr_fasion_mnist.csv',
             location_training_images='/mnt/data/train_img.csv',
             location_training_labels='/mnt/data/train_labels.csv',
             location_test_images='/mnt/data/test_img.csv',
             location_test_labels='/mnt/data/test_labels.csv',
             location_base_model='/mnt/model/base_model.h5',
             location_trained_model='/mnt/model/trained_model.h5'):
  data_preparation = data_prep_op(dataset_location, location_prepared_dataset).apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt"))
  feature_engineering = feature_eng_op(data_preparation.outputs['output'], location_improved_dataset).apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt"))
  data_split = data_split_op(feature_engineering.outputs['output'], test_size, random_state, location_training_images, location_training_labels, location_test_images, location_test_labels).apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt"))
  
  with dsl.Condition(use_pretrained_model == 'True'):
    model_building = model_download_op(input_shape_height, input_shape_width, location_base_model).apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt"))
    model_training = model_train_op(data_split.outputs['train_img'], data_split.outputs['train_label'], input_shape_height, input_shape_width, model_building.outputs['output_model_loc'], num_epochs, location_trained_model).apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt"))
    model_evaluation = model_eval_op(data_split.outputs['test_img'], data_split.outputs['test_label'], input_shape_height, input_shape_width, model_training.outputs['output_model_loc'], '/mlpipeline-ui-metadata.json').apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt"))


  with dsl.Condition(use_pretrained_model == 'False'):
    model_building = model_build_op(input_shape_height, input_shape_width, model_units_num, model_outputs_num, model_activation_func_layer2, model_activation_func_layer3, optimizer, loss, metrics, location_base_model).apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt"))
    model_training = model_train_op(data_split.outputs['train_img'], data_split.outputs['train_label'], input_shape_height, input_shape_width, model_building.outputs['output_model_loc'], num_epochs, location_trained_model).apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt"))
    model_evaluation = model_eval_op(data_split.outputs['test_img'], data_split.outputs['test_label'], input_shape_height, input_shape_width, model_training.outputs['output_model_loc'], '/mlpipeline-ui-metadata.json').apply(onprem.mount_pvc("fashion-mnist-vol", 'local-storage', "/mnt"))
    def testVolumeMountingPipelineOperatorFuncs(self):
        mount_volume_op = onprem.mount_pvc('my-persistent-volume-claim',
                                           'my-volume-name',
                                           '/mnt/volume-mount-path')
        config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
            pipeline_operator_funcs=[mount_volume_op])

        kubeflow_dag_runner.KubeflowDagRunner(config=config).run(
            _two_step_pipeline())
        file_path = 'two_step_pipeline.tar.gz'
        self.assertTrue(fileio.exists(file_path))

        with tarfile.TarFile.open(file_path).extractfile(
                'pipeline.yaml') as pipeline_file:
            self.assertIsNotNone(pipeline_file)
            pipeline = yaml.safe_load(pipeline_file)

            container_templates = [
                c for c in pipeline['spec']['templates'] if 'container' in c
            ]
            self.assertEqual(2, len(container_templates))

            volumes = [{
                'name': 'my-volume-name',
                'persistentVolumeClaim': {
                    'claimName': 'my-persistent-volume-claim'
                }
            }]

            # Check that the PVC is specified for kfp<=0.1.31.1.
            if 'volumes' in pipeline['spec']:
                self.assertEqual(volumes, pipeline['spec']['volumes'])

            for template in container_templates:
                # Check that each container has the volume mounted.
                self.assertEqual([{
                    'name': 'my-volume-name',
                    'mountPath': '/mnt/volume-mount-path'
                }], template['container']['volumeMounts'])

                # Check that each template has the PVC specified for kfp>=0.1.31.2.
                if 'volumes' in template:
                    self.assertEqual(volumes, template['volumes'])
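
The test above asserts exactly the structure that kfp.onprem.mount_pvc adds to each step. A simplified, hedged sketch of roughly what that operator function does (not the library's exact source): it returns a function that attaches a PVC-backed volume and a matching volume mount to whatever task it is applied to.

# Simplified sketch of the mount_pvc pattern (kfp v1 ContainerOp API).
from kubernetes import client as k8s_client

def mount_pvc_sketch(pvc_name, volume_name, volume_mount_path):
    def _mount_pvc(task):
        return task.add_volume(
            k8s_client.V1Volume(
                name=volume_name,
                persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(
                    claim_name=pvc_name))
        ).add_volume_mount(
            k8s_client.V1VolumeMount(name=volume_name,
                                     mount_path=volume_mount_path))
    return _mount_pvc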
Example #11
def body_parts_pipeline(model_dir='gs://your-bucket/export',
                        model_name='dummy',
                        server_name='dummy'):

    serve_args = [
        '--model_path', model_dir, '--model_name', model_name, '--server_name',
        server_name
    ]

    serve = dsl.ContainerOp(name='serve',
                            image='gcr.io/bigdata-2020/dlaas/body:latest',
                            arguments=serve_args)

    steps = [serve]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))
    def testVolumeMountingPipelineOperatorFuncs(self):
        mount_volume_op = onprem.mount_pvc('my-persistent-volume-claim',
                                           'my-volume-name',
                                           '/mnt/volume-mount-path')
        config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
            pipeline_operator_funcs=[mount_volume_op])

        kubeflow_dag_runner.KubeflowDagRunner(config=config).run(
            _two_step_pipeline())
        file_path = os.path.join(self.test_dir, 'two_step_pipeline.tar.gz')
        self.assertTrue(tf.gfile.Exists(file_path))

        with tarfile.TarFile.open(file_path).extractfile(
                'pipeline.yaml') as pipeline_file:
            self.assertIsNotNone(pipeline_file)
            pipeline = yaml.safe_load(pipeline_file)

            containers = [
                c for c in pipeline['spec']['templates'] if 'container' in c
            ]
            self.assertEqual(2, len(containers))

            # Check that each container has the volume mounted.
            self.assertEqual([{
                'name': 'my-volume-name',
                'mountPath': '/mnt/volume-mount-path'
            }], containers[0]['container']['volumeMounts'])

            self.assertEqual([{
                'name': 'my-volume-name',
                'mountPath': '/mnt/volume-mount-path'
            }], containers[1]['container']['volumeMounts'])

            # Check that the PVC is specified.
            self.assertEqual([{
                'name': 'my-volume-name',
                'persistentVolumeClaim': {
                    'claimName': 'my-persistent-volume-claim'
                }
            }], pipeline['spec']['volumes'])
Example #13
def mnist_pipeline(experiment_name='mnist',
                   namespace='kubeflow',
                   gs_bucket='gs://your-bucket/export',
                   epochs=10,
                   batch_size=128,
                   model_dir='gs://your-bucket/export',
                   model_name='dummy',
                   server_name='dummy'):

    train_args = [
        '--bucket_name', gs_bucket,
        '--epochs', epochs,
    ]

    convert_op1 = func_to_container_op(step1)
    op1 = convert_op1()


    convert_op = func_to_container_op(convert_mnist_experiment_result)
    op2 = convert_op(op1.output, gs_bucket, epochs)

    train = dsl.ContainerOp(
        name='train',
        image='gcr.io/dais-data-dev-txwj/mnist/train:latest',
        arguments=op2.output
    )

    steps = [op1, op2, train]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))

    op2.after(op1)
    train.after(op2)
def main(unused_argv):
    serving_model_dir = os.path.join(FLAGS.project_root, 'serving_model',
                                     FLAGS.pipeline_name)

    module_file = os.path.join(FLAGS.project_root, 'titanic_keras_utils.py')
    # Root directory to store pipeline artifacts.
    pipeline_root = os.path.join(FLAGS.project_root, 'pipeline')
    metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config(
    )

    # This pipeline automatically injects the Kubeflow TFX image if the
    # environment variable 'KUBEFLOW_TFX_IMAGE' is defined. Currently, the tfx
    # cli tool exports the environment variable to pass to the pipelines.
    tfx_image = os.environ.get('KUBEFLOW_TFX_IMAGE', None)

    runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
        kubeflow_metadata_config=metadata_config,
        # Specify custom docker image to use.
        tfx_image=tfx_image,
        pipeline_operator_funcs=(
            # If running on K8s Engine (GKE) on Google Cloud Platform (GCP),
            # kubeflow_dag_runner.get_default_pipeline_operator_funcs() provides
            # default configurations specifically for GKE on GCP, such as secrets.
            [
                onprem.mount_pvc(_persistent_volume_claim, _persistent_volume,
                                 _persistent_volume_mount)
            ]))

    kubeflow_dag_runner.KubeflowDagRunner(config=runner_config).run(
        create_tfx_pipeline(
            pipeline_name=FLAGS.pipeline_name,
            pipeline_root=pipeline_root,
            data_root=FLAGS.data_root,
            module_file=module_file,
            serving_model_dir=serving_model_dir,
            # 0 means auto-detect based on on the number of CPUs available during
            # execution time.
            direct_num_workers=0))
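
main() above reads FLAGS.project_root, FLAGS.pipeline_name, and FLAGS.data_root, which are defined outside this snippet. A hedged sketch of the absl entry-point boilerplate it assumes; the defaults and help strings are illustrative:

# Illustrative absl flag definitions and entry point for the main() above.
import os
from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_string('project_root', os.path.expanduser('~/titanic'),
                    'Root directory of the project.')
flags.DEFINE_string('pipeline_name', 'titanic_pipeline_kubeflow',
                    'Name of the TFX pipeline.')
flags.DEFINE_string('data_root', os.path.expanduser('~/titanic/data'),
                    'Directory containing the input CSV data.')

if __name__ == '__main__':
    app.run(main)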
Example #15
def mnist_pipeline(gs_bucket='gs://your-bucket/export',
                   epochs=10,
                   batch_size=128,
                   model_dir='gs://your-bucket/export',
                   model_name='dummy',
                   server_name='dummy'):

  train_args = [
      '--bucket_name', gs_bucket,
      '--epochs', epochs,
      '--batch_size', batch_size
  ]
  train = dsl.ContainerOp(
      name='train',
      image='gcr.io/kb-poc-262417/mnist/train:latest',
      arguments= train_args
  )

  serve_args = [
      '--model_path', model_dir,
      '--model_name', model_name,
      '--server_name', server_name
  ]

  serve = dsl.ContainerOp(
      name='serve',
      image='gcr.io/kb-poc-262417/mnist/pipeline/deployer:latest',
      arguments=serve_args
  )

  steps = [train, serve]
  for step in steps:
    if platform == 'GCP':
      step.apply(gcp.use_gcp_secret('user-gcp-sa'))
    else:
      step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))

  serve.after(train)
def mnist_train_pipeline(model_export_dir='gs://your-bucket/export',
                         train_steps='200',
                         learning_rate='0.01',
                         batch_size='100',
                         pvc_name=''):
    """
  Pipeline with three stages:
    1. train an MNIST classifier
    2. deploy a tf-serving instance to the cluster
    3. deploy a web-ui to interact with it
  """
    train = dsl.ContainerOp(name='train',
                            image='TRAIN_IMG',
                            arguments=[
                                "/opt/model.py", "--tf-export-dir",
                                model_export_dir, "--tf-train-steps",
                                train_steps, "--tf-batch-size", batch_size,
                                "--tf-learning-rate", learning_rate
                            ])

    push = dsl.ContainerOp(name='push',
                           image='PUSH_IMG',
                           arguments=[
                               "/opt/entrypoint.sh",
                               "PUSH_REPO",
                               "PUSH_LOGIN",
                               "PUSH_PASS",
                               "PUSH_SHA",
                           ])
    push.after(train)

    steps = [train, push]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))
Example #17
def ai_training_run(
        # Define variables that the user can set in the pipelines UI; set default values
        dataset_volume_pvc_existing: str = "dataset-vol",
        trained_model_volume_pvc_existing: str = "kfp-model-vol",
        execute_data_prep_step__yes_or_no: str = "yes",
        data_prep_step_container_image: str = "nvcr.io/nvidia/tensorflow:21.03-tf1-py3",
        data_prep_step_command: str = "<insert command here>",
        data_prep_step_dataset_volume_mountpoint: str = "/mnt/dataset",
        train_step_container_image: str = "nvcr.io/nvidia/tensorflow:21.03-tf1-py3",
        train_step_command: str = "<insert command here>",
        train_step_dataset_volume_mountpoint: str = "/mnt/dataset",
        train_step_model_volume_mountpoint: str = "/mnt/model",
        validation_step_container_image: str = "nvcr.io/nvidia/tensorflow:21.03-tf1-py3",
        validation_step_command: str = "<insert command here>",
        validation_step_dataset_volume_mountpoint: str = "/mnt/dataset",
        validation_step_model_volume_mountpoint: str = "/mnt/model"):
    # Set GPU limits; Due to SDK limitations, this must be hardcoded
    train_step_num_gpu = 0
    validation_step_num_gpu = 0

    # Pipeline Steps:

    # Execute data prep step
    with dsl.Condition(execute_data_prep_step__yes_or_no == "yes"):
        data_prep = dsl.ContainerOp(name="data-prep",
                                    image=data_prep_step_container_image,
                                    command=["sh", "-c"],
                                    arguments=[data_prep_step_command])
        # Mount dataset volume/pvc
        data_prep.apply(
            onprem.mount_pvc(dataset_volume_pvc_existing, 'dataset',
                             data_prep_step_dataset_volume_mountpoint))

    # Create a snapshot of the dataset volume/pvc for traceability
    volume_snapshot_name = "dataset-{{workflow.uid}}"
    dataset_snapshot = dsl.ContainerOp(
        name="dataset-snapshot",
        image="python:3",
        command=["/bin/bash", "-c"],
        arguments=[
            "\
            python3 -m pip install netapp-dataops-k8s && \
            echo '" + volume_snapshot_name +
            "' > /volume_snapshot_name.txt && \
            netapp_dataops_k8s_cli.py create volume-snapshot --pvc-name=" +
            str(dataset_volume_pvc_existing) + " --snapshot-name=" +
            str(volume_snapshot_name) + " --namespace={{workflow.namespace}}"
        ],
        file_outputs={"volume_snapshot_name": "/volume_snapshot_name.txt"})
    # State that snapshot should be created after the data prep job completes
    dataset_snapshot.after(data_prep)

    # Execute training step
    train = dsl.ContainerOp(name="train-model",
                            image=train_step_container_image,
                            command=["sh", "-c"],
                            arguments=[train_step_command])
    # Mount dataset volume/pvc
    train.apply(
        onprem.mount_pvc(dataset_volume_pvc_existing, 'datavol',
                         train_step_dataset_volume_mountpoint))
    # Mount model volume/pvc
    train.apply(
        onprem.mount_pvc(trained_model_volume_pvc_existing, 'modelvol',
                         train_step_model_volume_mountpoint))
    # Request that GPUs be allocated to training job pod
    if train_step_num_gpu > 0:
        train.set_gpu_limit(train_step_num_gpu, 'nvidia')
    # State that training job should be executed after dataset volume snapshot is taken
    train.after(dataset_snapshot)

    # Create a snapshot of the model volume/pvc for model versioning
    volume_snapshot_name = "kfp-model-{{workflow.uid}}"
    model_snapshot = dsl.ContainerOp(
        name="model-snapshot",
        image="python:3",
        command=["/bin/bash", "-c"],
        arguments=[
            "\
            python3 -m pip install netapp-dataops-k8s && \
            echo '" + volume_snapshot_name +
            "' > /volume_snapshot_name.txt && \
            netapp_dataops_k8s_cli.py create volume-snapshot --pvc-name=" +
            str(trained_model_volume_pvc_existing) + " --snapshot-name=" +
            str(volume_snapshot_name) + " --namespace={{workflow.namespace}}"
        ],
        file_outputs={"volume_snapshot_name": "/volume_snapshot_name.txt"})
    # State that snapshot should be created after the training job completes
    model_snapshot.after(train)

    # Execute inference validation job
    inference_validation = dsl.ContainerOp(
        name="validate-model",
        image=validation_step_container_image,
        command=["sh", "-c"],
        arguments=[validation_step_command])
    # Mount dataset volume/pvc
    inference_validation.apply(
        onprem.mount_pvc(dataset_volume_pvc_existing, 'datavol',
                         validation_step_dataset_volume_mountpoint))
    # Mount model volume/pvc
    inference_validation.apply(
        onprem.mount_pvc(trained_model_volume_pvc_existing, 'modelvol',
                         validation_step_model_volume_mountpoint))
    # Request that GPUs be allocated to pod
    if validation_step_num_gpu > 0:
        inference_validation.set_gpu_limit(validation_step_num_gpu, 'nvidia')
    # State that inference validation job should be executed after model volume snapshot is taken
    inference_validation.after(model_snapshot)
Example #18
def mnist_pipeline():
    ENV_MANAGE_URL = V1EnvVar(name='MANAGE_URL', value='http://220.116.228.93:8088/send')

    data_0 = dsl.ContainerOp(
        name="load & preprocess data pipeline",
        image="byeongjokim/mnist-pre-data:latest",
    ).set_display_name('collect & preprocess data')\
    .apply(onprem.mount_pvc("data-pvc", volume_name="data", volume_mount_path="/data"))

    data_1 = dsl.ContainerOp(
        name="validate data pipeline",
        image="byeongjokim/mnist-val-data:latest",
    ).set_display_name('validate data').after(data_0)\
    .apply(onprem.mount_pvc("data-pvc", volume_name="data", volume_mount_path="/data"))

    train_model = dsl.ContainerOp(
        name="train embedding model",
        image="byeongjokim/mnist-train-model:latest",
    ).set_display_name('train model').after(data_1)\
    .apply(onprem.mount_pvc("data-pvc", volume_name="data", volume_mount_path="/data"))\
    .apply(onprem.mount_pvc("train-model-pvc", volume_name="train-model", volume_mount_path="/model"))

    embedding = dsl.ContainerOp(
        name="embedding data using embedding model",
        image="byeongjokim/mnist-embedding:latest",
    ).set_display_name('embedding').after(train_model)\
    .apply(onprem.mount_pvc("data-pvc", volume_name="data", volume_mount_path="/data"))\
    .apply(onprem.mount_pvc("train-model-pvc", volume_name="train-model", volume_mount_path="/model"))

    train_faiss = dsl.ContainerOp(
        name="train faiss",
        image="byeongjokim/mnist-train-faiss:latest",
    ).set_display_name('train faiss').after(embedding)\
    .apply(onprem.mount_pvc("data-pvc", volume_name="data", volume_mount_path="/data"))\
    .apply(onprem.mount_pvc("train-model-pvc", volume_name="train-model", volume_mount_path="/model"))

    analysis = dsl.ContainerOp(
        name="analysis total",
        image="byeongjokim/mnist-analysis:latest",
        file_outputs={
            "confusion_matrix": "/confusion_matrix.csv",
            "mlpipeline-ui-metadata": "/mlpipeline-ui-metadata.json",
            "accuracy": "/accuracy.json",
            "mlpipeline_metrics": "/mlpipeline-metrics.json"
        }
    ).add_env_variable(ENV_MANAGE_URL).set_display_name('analysis').after(train_faiss)\
    .apply(onprem.mount_pvc("data-pvc", volume_name="data", volume_mount_path="/data"))\
    .apply(onprem.mount_pvc("train-model-pvc", volume_name="train-model", volume_mount_path="/model"))

    baseline = 0.8
    with dsl.Condition(analysis.outputs["accuracy"] > baseline) as check_deploy:
        deploy = dsl.ContainerOp(
            name="deploy mar",
            image="byeongjokim/mnist-deploy:latest",
        ).add_env_variable(ENV_MANAGE_URL).set_display_name('deploy').after(analysis)\
        .apply(onprem.mount_pvc("train-model-pvc", volume_name="train-model", volume_mount_path="/model"))\
        .apply(onprem.mount_pvc("deploy-model-pvc", volume_name="deploy-model", volume_mount_path="/deploy-model"))
def santander_transaction_classification(
        output,
        project,
        train='gs://kubeflow-pipelines-demo/dataset/train.csv',
        evaluation='gs://kubeflow-pipelines-demo/dataset/test.csv',
        mode='local',
        preprocess_module='gs://kubeflow-pipelines-demo/dataset/preprocessing.py',
        learning_rate=0.1,
        hidden_layer_size='1500',
        steps=3000):
    output_template = str(output) + '/{{workflow.uid}}/{{pod.name}}/data'
    target_class_lambda = """lambda x: x['target']"""

    tf_server_name = 'kfdemo-service'

    if platform != 'GCP':
        vop = dsl.VolumeOp(name="create_pvc",
                           resource_name="pipeline-pvc",
                           modes=dsl.VOLUME_MODE_RWM,
                           size="1Gi")

        checkout = dsl.ContainerOp(
            name="checkout",
            image="alpine/git:latest",
            command=[
                "git", "clone", "https://github.com/kubeflow/pipelines.git",
                str(output) + "/pipelines"
            ],
        ).apply(onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
        checkout.after(vop)

    preprocess = dataflow_tf_transform_op(
        training_data_file_pattern=train,
        evaluation_data_file_pattern=evaluation,
        schema="not.txt",
        gcp_project=project,
        run_mode=mode,
        preprocessing_module=preprocess_module,
        transformed_data_dir=output_template)

    training = tf_train_op(transformed_data_dir=preprocess.output,
                           schema='not.txt',
                           learning_rate=learning_rate,
                           hidden_layer_size=hidden_layer_size,
                           steps=steps,
                           target='tips',
                           preprocessing_module=preprocess_module,
                           training_output_dir=output_template)

    prediction = dataflow_tf_predict_op(
        data_file_pattern=evaluation,
        schema='not.txt',
        target_column='tips',
        model=training.outputs['training_output_dir'],
        run_mode=mode,
        gcp_project=project,
        predictions_dir=output_template)

    cm = confusion_matrix_op(predictions=prediction.outputs['predictions_dir'],
                             output_dir=output_template)

    roc = roc_op(predictions_dir=prediction.outputs['predictions_dir'],
                 target_lambda=target_class_lambda,
                 output_dir=output_template)

    steps = [training, prediction, cm, roc]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(
                onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
Example #20
def taxi_cab_classification(
    project,
    output='s3://mlpipeline/tfx/output',
    column_names='s3://mlpipeline/tfx/taxi-cab-classification/column-names.json',
    key_columns='trip_start_timestamp',
    train='s3://mlpipeline/tfx/taxi-cab-classification/train.csv',
    evaluation='s3://mlpipeline/tfx/taxi-cab-classification/eval.csv',
    mode='local',
    preprocess_module='s3://mlpipeline/tfx/taxi-cab-classification/preprocessing.py',
    learning_rate=0.1,
    hidden_layer_size='1500',
    steps=3000,
    analyze_slice_column='trip_start_hour'
):
    output_template = str(output) + '/{{workflow.uid}}/{{pod.name}}/data'
    target_lambda = """lambda x: (x['target'] > x['fare'] * 0.2)"""
    target_class_lambda = """lambda x: 1 if (x['target'] > x['fare'] * 0.2) else 0"""

    tf_server_name = 'taxi-cab-classification-model-{{workflow.uid}}'

    if platform == 'onprem':
        if storage == 'minio':
            data_preparation = dsl.ContainerOp(
                name="data_preparation",
                image="aiven86/minio_mc-git",
                command=["sh", "/bin/run.sh"],
            ).set_image_pull_policy('IfNotPresent')
            data_preparation.container.add_env_variable(V1EnvVar(name='GITPAHT', value=GITPAHT))
            data_preparation.container.add_env_variable(V1EnvVar(name='GITDIR', value=GITDIR))
            data_preparation.container.add_env_variable(V1EnvVar(name='MINIOPATH', value=MINIOPATH))
            data_preparation.container.add_env_variable(V1EnvVar(name='DATAPATH', value=DATAPATH))
        else:
            vop = dsl.VolumeOp(
                name="create_pvc",
                storage_class="rook-ceph-fs",
                resource_name="pipeline-pvc",
                modes=dsl.VOLUME_MODE_RWM,
                size="1Gi"
            )
        
            data_preparation = dsl.ContainerOp(
                name="data_preparation",
                image="aiven86/git",
                command=["git", "clone", "https://github.com/kubeflow/pipelines.git", str(output) + "/pipelines"],
            ).apply(onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
            data_preparation.after(vop)

    validation = dataflow_tf_data_validation_op(
        inference_data=train,
        validation_data=evaluation,
        column_names=column_names,
        key_columns=key_columns,
        gcp_project=project,
        run_mode=mode,
        validation_output=output_template,
    )
    if platform == 'onprem':
        validation.after(data_preparation)

    preprocess = dataflow_tf_transform_op(
        training_data_file_pattern=train,
        evaluation_data_file_pattern=evaluation,
        schema=validation.outputs['schema'],
        gcp_project=project,
        run_mode=mode,
        preprocessing_module=preprocess_module,
        transformed_data_dir=output_template
    )

    training = tf_train_op(
        transformed_data_dir=preprocess.output,
        schema=validation.outputs['schema'],
        learning_rate=learning_rate,
        hidden_layer_size=hidden_layer_size,
        steps=steps,
        target='tips',
        preprocessing_module=preprocess_module,
        training_output_dir=output_template
    )

    analysis = dataflow_tf_model_analyze_op(
        model=training.output,
        evaluation_data=evaluation,
        schema=validation.outputs['schema'],
        gcp_project=project,
        run_mode=mode,
        slice_columns=analyze_slice_column,
        analysis_results_dir=output_template
    )

    prediction = dataflow_tf_predict_op(
        data_file_pattern=evaluation,
        schema=validation.outputs['schema'],
        target_column='tips',
        model=training.output,
        run_mode=mode,
        gcp_project=project,
        predictions_dir=output_template
    )

    cm = confusion_matrix_op(
        predictions=prediction.output,
        target_lambda=target_lambda,
        output_dir=output_template
    )

    roc = roc_op(
        predictions_dir=prediction.output,
        target_lambda=target_class_lambda,
        output_dir=output_template
    )

    if platform == 'GCP' or storage == 'minio':
        deploy = kubeflow_deploy_op(
            model_dir=str(training.output) + '/export/export',
            server_name=tf_server_name
        )
    elif platform == 'onprem' and storage != 'minio':
        deploy = kubeflow_deploy_op(
            cluster_name=project,
            model_dir=str(training.output) + '/export/export',
            pvc_name=vop.outputs["name"],
            server_name=tf_server_name
        )

    steps = [validation, preprocess, training, analysis, prediction, cm, roc, deploy]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        elif platform == 'onprem':
            if storage == 'minio':
                step.container.add_env_variable(V1EnvVar(name='AWS_ACCESS_KEY_ID', value=AWS_ACCESS_KEY_ID))
                step.container.add_env_variable(V1EnvVar(name='AWS_SECRET_ACCESS_KEY', value=AWS_SECRET_ACCESS_KEY))
                step.container.add_env_variable(V1EnvVar(name='AWS_REGION', value=AWS_REGION))
                step.container.add_env_variable(V1EnvVar(name='S3_ENDPOINT', value=S3_ENDPOINT))
                step.container.add_env_variable(V1EnvVar(name='S3_USE_HTTPS', value=S3_USE_HTTPS))
                step.container.add_env_variable(V1EnvVar(name='S3_VERIFY_SSL', value=S3_VERIFY_SSL))
            else:
                step.apply(onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
Example #21
def mnist_pipeline(experiment_name='mnist',
                   namespace='kubeflow',
                   gs_bucket='gs://your-bucket/export',
                   epochs=10,
                   batch_size=128,
                   model_dir='gs://your-bucket/export',
                   model_name='dummy',
                   server_name='dummy'):

    objectiveConfig = {
        "type": "maximize",
        "goal": 0.85,
        "objectiveMetricName": "accuracy",
    }
    algorithmConfig = {"algorithmName": "random"}
    parameters = [
        {
            "name": "--learning_rate",
            "parameterType": "double",
            "feasibleSpace": {
                "min": "0.001",
                "max": "0.003"
            }
        },
        {
            "name": "--batch_size",
            "parameterType": "discrete",
            "feasibleSpace": {
                "list": ["512", "1024", "2048"]
            }
        },
    ]
    rawTemplate = {
        "apiVersion": "kubeflow.org/v1",
        "kind": "TFJob",
        "metadata": {
            "name": "{{.Trial}}",
            "namespace": "{{.NameSpace}}"
        },
        "spec": {
            "tfReplicaSpecs": {
                "Worker": {
                    "replicas": 1,
                    "restartPolicy": "OnFailure",
                    "template": {
                        "spec": {
                            "containers": [{
                                "name":
                                "tensorflow",
                                "image":
                                "gcr.io/dais-data-dev-txwj/mnist/train:latest",
                                "imagePullPolicy":
                                "Always",
                                "command": ["sh", "-c"],
                                "args": [
                                    "python /train.py --epochs 1 --save_model 0  {{- with .HyperParameters}} {{- range .}} {{.Name}}={{.Value}} {{- end}} {{- end}}"
                                ]
                            }]
                        }
                    }
                }
            }
        }
    }

    trialTemplate = {"goTemplate": {"rawTemplate": json.dumps(rawTemplate)}}

    katib_experiment_launcher_op = components.load_component_from_url(
        'https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/katib-launcher/component.yaml'
    )
    katibOp = katib_experiment_launcher_op(experiment_name=experiment_name,
                                           experiment_namespace=namespace,
                                           parallel_trial_count=3,
                                           max_trial_count=12,
                                           objective=str(objectiveConfig),
                                           algorithm=str(algorithmConfig),
                                           trial_template=str(trialTemplate),
                                           parameters=str(parameters),
                                           delete_finished_experiment=False)

    train_args = [
        '--bucket_name',
        gs_bucket,
        '--epochs',
        epochs,
    ]

    convert_op = func_to_container_op(convert_mnist_experiment_result)
    op2 = convert_op(katibOp.output, gs_bucket, epochs)

    train = dsl.ContainerOp(
        name='train',
        image='gcr.io/dais-data-dev-txwj/mnist/train:latest',
        arguments=op2.output)

    serve_args = [
        '--model_path', model_dir, '--model_name', model_name, '--server_name',
        server_name
    ]

    serve = dsl.ContainerOp(
        name='serve',
        image='gcr.io/dais-data-dev-txwj/mnist/serve:latest',
        arguments=serve_args)

    steps = [katibOp, op2, train, serve]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))

    op2.after(katibOp)
    train.after(op2)
    serve.after(train)
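
The pipeline above (like Example #13) wraps a convert_mnist_experiment_result helper with func_to_container_op, but the helper itself is not shown. A hedged sketch, with the signature inferred from the call convert_op(katibOp.output, gs_bucket, epochs) and an assumed argument format:

# Hypothetical lightweight-component helper; imports live inside the function
# because func_to_container_op containerizes only the function body.
def convert_mnist_experiment_result(experiment_result, bucket_name, epochs) -> str:
    import json
    best_params = json.loads(experiment_result)
    args = ['{}={}'.format(p['name'], p['value']) for p in best_params]
    args.append('--bucket_name={}'.format(bucket_name))
    args.append('--epochs={}'.format(epochs))
    return ' '.join(args)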
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats, transform,
            trainer, model_analyzer, model_validator, pusher
        ],
        additional_pipeline_args={
            'tfx_image': 'tensorflow/tfx:0.14.0rc1'
        },
        log_root='/var/tmp/tfx/logs',
    )


if __name__ == '__main__':
    mount_volume_op = onprem.mount_pvc(
        "tfx-pvc",
        "tfx-pv",
        _tfx_root)
    config = KubeflowDagRunnerConfig(
        pipeline_operator_funcs=[mount_volume_op]
    )
    _pipeline = _create_pipeline(
        pipeline_name=_pipeline_name,
        pipeline_root=_pipeline_root,
        data_root=os.path.join(_pipeline_root, 'data'),
        module_file=_module_file,
        serving_model_dir=_serving_model_dir,
        )
    KubeflowRunner(config=config).run(_pipeline)
Example #23
        eval_steps=100,
    )

    runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
        kubeflow_metadata_config=metadata_config,
        # Specify custom docker image to use.
        tfx_image=tfx_image,
        pipeline_operator_funcs=(
            # If running on K8s Engine (GKE) on Google Cloud Platform (GCP),
            # kubeflow_dag_runner.get_default_pipeline_operator_funcs()
            # provides default configurations specifically for GKE on GCP,
            # such as secrets.
            kubeflow_dag_runner.get_default_pipeline_operator_funcs()
            + [
                onprem.mount_pvc(
                    persistent_volume_claim,
                    persistent_volume,
                    persistent_volume_mount,
                )
            ]
        ),
    )

    p = init_kubeflow_pipeline(components, output_base, direct_num_workers=0)
    output_filename = f"{pipeline_name}.yaml"
    kubeflow_dag_runner.KubeflowDagRunner(
        config=runner_config,
        output_dir=output_dir,
        output_filename=output_filename,
    ).run(p)
Example #24
def env_params(name,
               value='https://s3.us.cloud-object-storage.appdomain.cloud'):
    def _env_params(task):
        from kubernetes import client as k8s_client
        return (task.add_env_variable(
            k8s_client.V1EnvVar(name=name, value=value)))

    return _env_params


s3_endpoint_params = env_params('S3_ENDPOINT', 'minio-service.kubeflow:9000')
s3_use_https_params = env_params('S3_USE_HTTPS', '0')
s3_verify_ssl_params = env_params('S3_VERIFY_SSL', '0')
access_key_id_params = env_params('AWS_ACCESS_KEY_ID', 'minio')
secret_access_key_params = env_params('AWS_SECRET_ACCESS_KEY', 'minio123')

mount_volume_op = onprem.mount_pvc('tfx-volume', 'shared-volume',
                                   _output_bucket)
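
For context, a hedged sketch of how the operator functions defined above are typically handed to the TFX Kubeflow runner so that every component pod receives the MinIO settings and the shared volume (the same KubeflowDagRunnerConfig pattern shown in other examples in this listing):

# Wire the env/volume operator functions into the runner config.
from tfx.orchestration.kubeflow import kubeflow_dag_runner

runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
    pipeline_operator_funcs=[
        s3_endpoint_params,
        s3_use_https_params,
        s3_verify_ssl_params,
        access_key_id_params,
        secret_access_key_params,
        mount_volume_op,
    ])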


def _create_test_pipeline(pipeline_root: Text, csv_input_location: Text,
                          taxi_module_file: Text, output_bucket: Text,
                          enable_cache: bool):
    """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline.

  Args:
    pipeline_name: The name of the pipeline.
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    taxi_module_file: The location of the module file for Transform/Trainer.
    enable_cache: Whether to enable cache or not.

  Returns:
Example #25
def mnist_pipeline(model_export_dir='/mnt/export',
                   train_steps='1000',
                   learning_rate='0.01',
                   batch_size='100',
                   output='/mnt'):
    """
  Pipeline with three stages:
    1. train an MNIST classifier
    2. deploy a tf-serving instance to the cluster
    3. deploy a web-ui to interact with it
  """

    if platform != 'GCP':
        vop = dsl.VolumeOp(name="create_pvc",
                           storage_class="rook-ceph-fs",
                           resource_name="pipeline-pvc",
                           modes=dsl.VOLUME_MODE_RWM,
                           size="1Gi")
        pvc_name = vop.outputs["name"]

        download = dsl.ContainerOp(
            name="download_data",
            image="aiven86/git",
            command=[
                "git", "clone",
                "https://github.com/cdyangzhenyu/mnist-data.git",
                str(output) + "/data"
            ],
        ).apply(onprem.mount_pvc(pvc_name, 'local-storage', output))
        download.after(vop)

    train = dsl.ContainerOp(name='train',
                            image='aiven86/tensorflow-mnist-kfp:1.13.1-gpu',
                            arguments=[
                                "/opt/model.py", "--tf-data-dir",
                                str(output) + "/data", "--tf-export-dir",
                                model_export_dir, "--tf-train-steps",
                                train_steps, "--tf-batch-size", batch_size,
                                "--tf-learning-rate", learning_rate
                            ]).add_resource_limit("aliyun.com/gpu-mem", 1)
    train.after(download)

    serve_args = [
        '--model-export-path', model_export_dir, '--server-name',
        "mnist-service"
    ]
    if platform != 'GCP':
        serve_args.extend(
            ['--cluster-name', "mnist-pipeline", '--pvc-name', pvc_name])

    serve = dsl.ContainerOp(
        name='serve',
        image='aiven86/ml-pipeline_ml-pipeline-kubeflow-deployer:'
        '7775692adf28d6f79098e76e839986c9ee55dd61',
        arguments=serve_args)
    serve.after(train)

    webui_args = [
        '--image', 'aiven86/kubeflow-examples_mnist_web-ui:'
        'v20190304-v0.2-176-g15d997b-pipelines', '--name', 'web-ui',
        '--container-port', '5000', '--service-port', '80', '--service-type',
        "NodePort"
    ]
    if platform != 'GCP':
        webui_args.extend(['--cluster-name', "mnist-pipeline"])

    web_ui = dsl.ContainerOp(
        name='web-ui',
        image='aiven86/kubeflow-examples_mnist_deploy-service:latest',
        arguments=webui_args).set_image_pull_policy('IfNotPresent')
    web_ui.after(serve)

    steps = [train, serve, web_ui]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', output))
Example #26
def taxi_cab_classification(
        pvc_size='1Gi',
        project='tfx-taxi-pipeline-on-prem',
        column_names='pipelines/samples/tfx/taxi-cab-classification/column-names.json',
        key_columns='trip_start_timestamp',
        train='pipelines/samples/tfx/taxi-cab-classification/train.csv',
        evaluation='pipelines/samples/tfx/taxi-cab-classification/eval.csv',
        mode='local',
        preprocess_module='pipelines/samples/tfx/taxi-cab-classification/preprocessing.py',
        learning_rate=0.1,
        hidden_layer_size=1500,
        steps=3000,
        analyze_slice_column='trip_start_hour'):

    tf_server_name = 'taxi-cab-classification-model-{{workflow.name}}'

    vop = dsl.VolumeOp(name='create-volume',
                       resource_name='taxi-cab-data',
                       modes=dsl.VOLUME_MODE_RWM,
                       size=pvc_size)

    checkout = dsl.ContainerOp(
        name="checkout",
        image="alpine/git:latest",
        command=[
            "git", "clone", "https://github.com/kubeflow/pipelines.git",
            "/mnt/pipelines"
        ],
    ).apply(onprem.mount_pvc(vop.outputs["name"], 'local-storage', "/mnt"))
    checkout.after(vop)

    validation = dataflow_tf_data_validation_op('/mnt/%s' % train,
                                                '/mnt/%s' % evaluation,
                                                '/mnt/%s' % column_names,
                                                key_columns, project, mode,
                                                '/mnt', vop.volume)
    validation.after(checkout)

    preprocess = dataflow_tf_transform_op('/mnt/%s' % train,
                                          '/mnt/%s' % evaluation,
                                          validation.outputs['schema'],
                                          project, mode,
                                          '/mnt/%s' % preprocess_module,
                                          '/mnt', vop.volume)

    training = tf_train_op(preprocess.output, validation.outputs['schema'],
                           learning_rate, hidden_layer_size, steps, 'tips',
                           '/mnt/%s' % preprocess_module, '/mnt', vop.volume)

    analysis = dataflow_tf_model_analyze_op(
        training.output, '/mnt/%s' % evaluation, validation.outputs['schema'],
        project, mode, analyze_slice_column, '/mnt', vop.volume)

    prediction = dataflow_tf_predict_op('/mnt/%s' % evaluation,
                                        validation.outputs['schema'], 'tips',
                                        training.output, mode, project, '/mnt',
                                        vop.volume)

    cm = confusion_matrix_op(prediction.output, '/mnt', vop.volume)

    roc = roc_op(prediction.output, '/mnt', vop.volume)

    deploy = kubeflow_deploy_op(training.output, tf_server_name, vop.output,
                                {'/mnt': vop.volume})
Example #27
    )

    # This pipeline automatically injects the Kubeflow TFX image if the
    # environment variable 'KUBEFLOW_TFX_IMAGE' is defined. Currently, the tfx
    # cli tool exports the environment variable to pass to the pipelines.
    tfx_image = os.environ.get('KUBEFLOW_TFX_IMAGE', None)

    runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
        kubeflow_metadata_config=metadata_config,
        # Specify custom docker image to use.
        tfx_image=tfx_image,
        pipeline_operator_funcs=(
            # If running on K8s Engine (GKE) on Google Cloud Platform (GCP),
            # kubeflow_dag_runner.get_default_pipeline_operator_funcs() provides
            # default configurations specifically for GKE on GCP, such as secrets.
            [
                onprem.mount_pvc(_persistent_volume_claim, _persistent_volume,
                                 _persistent_volume_mount)
            ]))

    kubeflow_dag_runner.KubeflowDagRunner(config=runner_config).run(
        _create_pipeline(
            pipeline_name=_pipeline_name,
            pipeline_root=_pipeline_root,
            data_root=_data_root,
            module_file=_module_file,
            serving_model_dir=_serving_model_dir,
            # 0 means auto-detect based on on the number of CPUs available during
            # execution time.
            direct_num_workers=0))
def ai_training_run(
    # Define variables that the user can set in the pipelines UI; set default values
    ontap_cluster_mgmt_hostname: str = "10.61.188.40", 
    ontap_cluster_admin_acct_k8s_secret: str = "ontap-cluster-mgmt-account",
    ontap_api_verify_ssl_cert: bool = True,
    dataset_volume_pvc_existing: str = "dataset-vol",
    dataset_volume_pv_existing: str = "pvc-43b12235-f32e-4dc4-a7b8-88e90d935a12",
    trained_model_volume_pvc_existing: str = "kfp-model-vol",
    trained_model_volume_pv_existing: str = "pvc-236e893b-63b4-40d3-963b-e709b9b2816b",
    execute_data_prep_step__yes_or_no: str = "yes",
    data_prep_step_container_image: str = "ubuntu:bionic",
    data_prep_step_command: str = "<insert command here>",
    data_prep_step_dataset_volume_mountpoint: str = "/mnt/dataset",
    train_step_container_image: str = "nvcr.io/nvidia/tensorflow:19.12-tf1-py3",
    train_step_command: str = "<insert command here>",
    train_step_dataset_volume_mountpoint: str = "/mnt/dataset",
    train_step_model_volume_mountpoint: str = "/mnt/model",
    validation_step_container_image: str = "nvcr.io/nvidia/tensorflow:19.12-tf1-py3",
    validation_step_command: str = "<insert command here>",
    validation_step_dataset_volume_mountpoint: str = "/mnt/dataset",
    validation_step_model_volume_mountpoint: str = "/mnt/model"
) :
    # Set GPU limits; Due to SDK limitations, this must be hardcoded
    train_step_num_gpu = 0
    validation_step_num_gpu = 0

    # Pipeline Steps:

    # Execute data prep step
    with dsl.Condition(execute_data_prep_step__yes_or_no == "yes") :
        data_prep = dsl.ContainerOp(
            name="data-prep",
            image=data_prep_step_container_image,
            command=["sh", "-c"],
            arguments=[data_prep_step_command]
        )
        # Mount dataset volume/pvc
        data_prep.apply(
            onprem.mount_pvc(dataset_volume_pvc_existing, 'dataset', data_prep_step_dataset_volume_mountpoint)
        )

    # Create a snapshot of the dataset volume/pvc for traceability
    dataset_snapshot = NetappSnapshotOp(
        ontap_cluster_mgmt_hostname, 
        dataset_volume_pv_existing,
        ontap_api_verify_ssl_cert
    )
    # Mount k8s secret containing ONTAP cluster admin account details
    dataset_snapshot.add_pvolumes({
        '/mnt/secret': k8s_client.V1Volume(
            name='ontap-cluster-admin',
            secret=k8s_client.V1SecretVolumeSource(
                secret_name=ontap_cluster_admin_acct_k8s_secret
            )
        )
    })
    # State that snapshot should be created after the data prep job completes
    dataset_snapshot.after(data_prep)

    # Execute training step
    train = dsl.ContainerOp(
        name="train-model",
        image=train_step_container_image,
        command=["sh", "-c"],
        arguments=[train_step_command]
    )
    # Mount dataset volume/pvc
    train.apply(
        onprem.mount_pvc(dataset_volume_pvc_existing, 'datavol', train_step_dataset_volume_mountpoint)
    )
    # Mount model volume/pvc
    train.apply(
        onprem.mount_pvc(trained_model_volume_pvc_existing, 'modelvol', train_step_model_volume_mountpoint)
    )
    # Request that GPUs be allocated to training job pod
    if train_step_num_gpu > 0:
        train.set_gpu_limit(train_step_num_gpu, 'nvidia')
    # State that training job should be executed after dataset volume snapshot is taken
    train.after(dataset_snapshot)

    # Create a snapshot of the model volume/pvc for model versioning
    model_snapshot = NetappSnapshotOp(
        ontap_cluster_mgmt_hostname, 
        trained_model_volume_pv_existing,
        ontap_api_verify_ssl_cert
    )
    # Mount k8s secret containing ONTAP cluster admin account details
    model_snapshot.add_pvolumes({
        '/mnt/secret': k8s_client.V1Volume(
            name='ontap-cluster-admin',
            secret=k8s_client.V1SecretVolumeSource(
                secret_name=ontap_cluster_admin_acct_k8s_secret
            )
        )
    })
    # State that snapshot should be created after the training job completes
    model_snapshot.after(train)

    # Execute inference validation job
    inference_validation = dsl.ContainerOp(
        name="validate-model",
        image=validation_step_container_image,
        command=["sh", "-c"],
        arguments=[validation_step_command]
    )
    # Mount dataset volume/pvc
    inference_validation.apply(
        onprem.mount_pvc(dataset_volume_pvc_existing, 'datavol', validation_step_dataset_volume_mountpoint)
    )
    # Mount model volume/pvc
    inference_validation.apply(
        onprem.mount_pvc(trained_model_volume_pvc_existing, 'modelvol', validation_step_model_volume_mountpoint)
    )
    # Request that GPUs be allocated to pod
    if validation_step_num_gpu > 0:
        inference_validation.set_gpu_limit(validation_step_num_gpu, 'nvidia')
    # State that inference validation job should be executed after model volume snapshot is taken
    inference_validation.after(model_snapshot)
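# --- Minimal sketch, not part of the original example ---
# The scraped snippet omits the @dsl.pipeline decorator and the submission code.
# The endpoint, pipeline metadata and experiment name below are assumptions, and
# the sketch assumes NetappSnapshotOp and the kfp imports are available at
# module level.
import kfp
from kfp import dsl

ai_training_pipeline = dsl.pipeline(
    name='AI Training Run',
    description='Data prep, training and validation with NetApp volume snapshots',
)(ai_training_run)

if __name__ == '__main__':
    client = kfp.Client(host='http://localhost:8080/pipeline')  # assumed endpoint
    client.create_run_from_pipeline_func(
        ai_training_pipeline,
        arguments={'execute_data_prep_step__yes_or_no': 'no'},  # "no" skips the data-prep step
        experiment_name='ai-training')  # assumed experiment name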
Example #29
def taxi_cab_classification(
        output,
        project,
        column_names='gs://ml-pipeline-playground/tfx/taxi-cab-classification/column-names.json',
        key_columns='trip_start_timestamp',
        train='gs://ml-pipeline-playground/tfx/taxi-cab-classification/train.csv',
        evaluation='gs://ml-pipeline-playground/tfx/taxi-cab-classification/eval.csv',
        mode='local',
        preprocess_module='gs://ml-pipeline-playground/tfx/taxi-cab-classification/preprocessing.py',
        learning_rate=0.1,
        hidden_layer_size='1500',
        steps=3000,
        analyze_slice_column='trip_start_hour'):
    output_template = str(output) + '/{{workflow.uid}}/{{pod.name}}/data'
    target_lambda = """lambda x: (x['target'] > x['fare'] * 0.2)"""
    target_class_lambda = """lambda x: 1 if (x['target'] > x['fare'] * 0.2) else 0"""

    tf_server_name = 'taxi-cab-classification-model-{{workflow.uid}}'

    if platform != 'GCP':
        vop = dsl.VolumeOp(name="create_pvc",
                           resource_name="pipeline-pvc",
                           modes=dsl.VOLUME_MODE_RWM,
                           size="1Gi")

        checkout = dsl.ContainerOp(
            name="checkout",
            image="alpine/git:latest",
            command=[
                "git", "clone", "https://github.com/kubeflow/pipelines.git",
                str(output) + "/pipelines"
            ],
        ).apply(onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
        checkout.after(vop)

    validation = dataflow_tf_data_validation_op(
        inference_data=train,
        validation_data=evaluation,
        column_names=column_names,
        key_columns=key_columns,
        gcp_project=project,
        run_mode=mode,
        validation_output=output_template,
    )
    if platform != 'GCP':
        validation.after(checkout)

    preprocess = dataflow_tf_transform_op(
        training_data_file_pattern=train,
        evaluation_data_file_pattern=evaluation,
        schema=validation.outputs['schema'],
        gcp_project=project,
        run_mode=mode,
        preprocessing_module=preprocess_module,
        transformed_data_dir=output_template)

    training = tf_train_op(transformed_data_dir=preprocess.output,
                           schema=validation.outputs['schema'],
                           learning_rate=learning_rate,
                           hidden_layer_size=hidden_layer_size,
                           steps=steps,
                           target='tips',
                           preprocessing_module=preprocess_module,
                           training_output_dir=output_template)

    analysis = dataflow_tf_model_analyze_op(
        model=training.output,
        evaluation_data=evaluation,
        schema=validation.outputs['schema'],
        gcp_project=project,
        run_mode=mode,
        slice_columns=analyze_slice_column,
        analysis_results_dir=output_template)

    prediction = dataflow_tf_predict_op(data_file_pattern=evaluation,
                                        schema=validation.outputs['schema'],
                                        target_column='tips',
                                        model=training.output,
                                        run_mode=mode,
                                        gcp_project=project,
                                        predictions_dir=output_template)

    cm = confusion_matrix_op(predictions=prediction.output,
                             target_lambda=target_lambda,
                             output_dir=output_template)

    roc = roc_op(predictions_dir=prediction.output,
                 target_lambda=target_class_lambda,
                 output_dir=output_template)

    if platform == 'GCP':
        deploy = kubeflow_deploy_op(model_dir=str(training.output) +
                                    '/export/export',
                                    server_name=tf_server_name)
    else:
        deploy = kubeflow_deploy_op(cluster_name=project,
                                    model_dir=str(training.output) +
                                    '/export/export',
                                    pvc_name=vop.outputs["name"],
                                    server_name=tf_server_name)

    steps = [
        validation, preprocess, training, analysis, prediction, cm, roc, deploy
    ]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(
                onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
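# --- Minimal sketch, not part of the original example ---
# taxi_cab_classification relies on a module-level `platform` constant and on
# *_op component factories loaded earlier in the original file. With those in
# place, the pipeline would typically be compiled into a package like this
# (the output filename is an assumption):
import kfp.compiler as compiler

if __name__ == '__main__':
    compiler.Compiler().compile(taxi_cab_classification, 'taxi_cab_classification.zip')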
def face_recognition(
    train_steps='30',
    learning_rate='-1',
    batch_size='1000',
    dataset_dir='/dataset',
    output_dir='/output',
    public_ip='10.1.0.15',
):
    """
  Pipeline with three stages:
    1. prepare the face recognition align dataset CASIA-WebFace
    2. train an facenet classifier model
    3. deploy a tf-serving instance to the cluster
    4. deploy a web-ui to interact with it
  """

    if platform == 'onprem':
        data_vop = dsl.VolumeOp(name="prepare_data_vop",
                                storage_class="rook-ceph-fs",
                                resource_name="data-pvc",
                                modes=dsl.VOLUME_MODE_RWM,
                                size="10Gi")
        data_pvc_name = data_vop.outputs["name"]

        output_vop = dsl.VolumeOp(name="prepare_output_vop",
                                  storage_class="csi-s3",
                                  resource_name="output-pvc",
                                  modes=dsl.VOLUME_MODE_RWM,
                                  size="1Gi")
        output_vop.after(data_vop)
        output_pvc_name = output_vop.outputs["name"]

    casia_align_data = str(
        dataset_dir) + "/data/casia_maxpy_mtcnnalign_182_160/"
    if is_aligned == 'True':
        raw_dataset = dsl.ContainerOp(
            name="raw_dataset",
            image="aiven86/facenet-dataset-casia-maxpy-clean:tail-2000",
            command=[
                "/bin/sh", "-c",
                "echo 'begin moving data';mv /data/ %s/;echo 'moving is finished';"
                % str(dataset_dir)
            ],
        ).apply(onprem.mount_pvc(data_pvc_name, 'dataset-storage',
                                 dataset_dir))
        raw_dataset.after(output_vop)
        casia_align_data = str(
            dataset_dir) + "/data/casia_maxpy_tail_2000_mtcnnalign_182_160"
        align_dataset_lfw = dsl.ContainerOp(
            name="align_dataset_lfw",
            image="aiven86/facenet-tensorflow:1.13.1-gpu-py3",
            command=[
                "/bin/sh", "-c",
                "python /facenet/src/align/align_dataset_mtcnn.py %s/data/lfw "
                "%s/data/lfw_mtcnnalign_160 --image_size 160 --margin 32 --random_order --gpu_memory_fraction 0.8"
                % (str(dataset_dir), str(dataset_dir))
            ],
        ).apply(onprem.mount_pvc(data_pvc_name, 'dataset-storage',
                                 dataset_dir))
        align_dataset_lfw.container.add_resource_limit("nvidia.com/gpu", 1)
        align_dataset_lfw.container.add_env_variable(
            V1EnvVar(name='PYTHONPATH', value=PYTHONPATH))
        align_dataset_lfw.after(raw_dataset)
        align_dataset = dsl.ContainerOp(
            name="align_dataset",
            image="aiven86/facenet-tensorflow:1.13.1-gpu-py3",
            command=[
                "/bin/sh", "-c",
                "python /facenet/src/align/align_dataset_mtcnn.py %s/data/CASIA-maxpy-clean-tail-2000 "
                "%s --image_size 182 --margin 44 --random_order --gpu_memory_fraction 0.8"
                % (str(dataset_dir), str(casia_align_data))
            ],
        ).add_resource_limit("nvidia.com/gpu", 1)
        align_dataset.after(align_dataset_lfw)
    else:
        align_dataset = dsl.ContainerOp(
            name="align_dataset",
            image="aiven86/facenet-dataset-casia-mtcnnalign:test",
            command=[
                "/bin/sh", "-c",
                "echo 'begin moving data';mv /data/ %s/;echo 'moving is finished';"
                % str(dataset_dir)
            ],
        )
        align_dataset.after(output_vop)

    train = dsl.ContainerOp(
        name='train',
        image='aiven86/facenet-tensorflow:1.13.1-gpu-py3',
        command=[
            "/bin/sh", "-c",
            "cd /facenet; python src/train_softmax.py --logs_base_dir %s/logs/facenet/ --models_base_dir %s/models/facenet/ "
            "--data_dir %s --image_size 160 --model_def models.inception_resnet_v1 "
            "--lfw_dir %s/data/lfw_mtcnnalign_160/ --optimizer ADAM --learning_rate %s --max_nrof_epochs %s --keep_probability 0.8 "
            "--random_crop --random_flip --use_fixed_image_standardization "
            "--learning_rate_schedule_file data/learning_rate_schedule_classifier_casia.txt --weight_decay 5e-4 "
            "--embedding_size 512 --lfw_distance_metric 1 --lfw_use_flipped_images --lfw_subtract_mean "
            "--validation_set_split_ratio 0.05 --validate_every_n_epochs 5 --prelogits_norm_loss_factor 5e-4 "
            "--epoch_size %s --gpu_memory_fraction 0.8; cp -r %s/logs %s/logs"
            % (str(dataset_dir), str(dataset_dir), str(casia_align_data),
               str(dataset_dir), learning_rate, train_steps, batch_size,
               str(dataset_dir), str(output_dir))
        ]).add_resource_limit("nvidia.com/gpu", 1)
    #.add_resource_limit("aliyun.com/gpu-mem", 2)
    train.after(align_dataset)

    transform_model = dsl.ContainerOp(
        name='transform_model',
        # Expose /output.txt (the exported model directory name) so that
        # transform_model.output can be consumed below.
        file_outputs={'output': '/output.txt'},
        image='aiven86/facenet-tensorflow:1.13.1-gpu-py3',
        command=[
            "/bin/sh", "-c",
            "MODEL_DIR=`ls %s/models/facenet/`;cd /facenet;"
            "python src/freeze_graph.py %s/models/facenet/$MODEL_DIR %s/models/facenet/$MODEL_DIR/$MODEL_DIR.pb;"
            "cp -r %s/models %s/models;echo $MODEL_DIR > /output.txt;cat /output.txt"
            % (str(dataset_dir), str(dataset_dir), str(dataset_dir),
               str(dataset_dir), str(output_dir))
        ]).add_resource_limit("nvidia.com/gpu", 1)

    transform_model.after(train)

    ran_str = ''.join(random.sample('zyxwvutsrqponmlkjihgfedcba0123456789', 5))

    tf_serv_service_name = "face-recognition-service-" + ran_str
    cluster_name = "face-recognition-pipeline-" + ran_str

    serve_args = [
        '--model-export-path', "/mnt/models/facenet/", '--server-name',
        tf_serv_service_name
    ]
    if platform == 'onprem':
        serve_args.extend(
            ['--cluster-name', cluster_name, '--pvc-name', output_pvc_name])

    serve = dsl.ContainerOp(
        name='serve',
        image='aiven86/ml-pipeline_ml-pipeline-kubeflow-deployer:'
        '7775692adf28d6f79098e76e839986c9ee55dd61',
        arguments=serve_args)
    serve.after(transform_model)
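    # transform_model.output resolves to the contents of /output.txt, i.e. the
    # exported model directory name written by the freeze_graph step above.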
    model_name = str(transform_model.output)

    tensorboard_args = [
        '--image',
        'tensorflow/tensorflow:1.13.1',
        '--name',
        'face-tensorboard-' + ran_str,
        '--container-port',
        '6006',
        '--service-port',
        '9000',
        '--service-type',
        "NodePort",
        '--pvc-name',
        output_pvc_name,
        '--cmd',
        '["/usr/local/bin/tensorboard","--logdir=/mnt/logs/facenet","--port=6006"]',
        '--public-ip',
        public_ip,
    ]
    if platform == 'onprem':
        tensorboard_args.extend(['--cluster-name', cluster_name])

    tensorboard = dsl.ContainerOp(
        name='tensorboard',
        image='aiven86/kubeflow-examples_face_deploy-service:tensorboard',
        arguments=tensorboard_args).set_image_pull_policy('IfNotPresent')
    tensorboard.after(serve)

    webui_args = [
        '--image',
        'aiven86/tf-face-recognition:1.0',
        '--name',
        'face-web-ui-' + ran_str,
        '--container-port',
        '5000',
        '--service-port',
        '5000',
        '--service-type',
        "NodePort",
        '--pvc-name',
        output_pvc_name,
        '--model-file-name',
        '/mnt/models/facenet/%s/%s.pb' % (model_name, model_name),
        '--tf-serving-host',
        tf_serv_service_name,
        '--public-ip',
        public_ip,
    ]
    if platform == 'onprem':
        webui_args.extend(['--cluster-name', cluster_name])

    web_ui = dsl.ContainerOp(
        name='web_ui',
        image='aiven86/kubeflow-examples_face_deploy-service:web-ui',
        arguments=webui_args).set_image_pull_policy('IfNotPresent')
    web_ui.after(serve)

    steps = [align_dataset, train, transform_model, serve, tensorboard, web_ui]
    for step in steps:
        step.apply(
            onprem.mount_pvc(data_pvc_name, 'dataset-storage', dataset_dir))
        step.apply(
            onprem.mount_pvc(output_pvc_name, 'output-storage', output_dir))
        if step in [align_dataset, train, transform_model]:
            step.container.add_env_variable(
                V1EnvVar(name='PYTHONPATH', value=PYTHONPATH))
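# --- Minimal sketch, not part of the scraped examples ---
# Module-level preamble that the snippets above assume. The concrete values of
# platform, PYTHONPATH and is_aligned are assumptions for illustration only;
# NetappSnapshotOp and the *_op component factories are likewise assumed to be
# defined or loaded (e.g. with kfp.components) earlier in the original files.
import os
import random

from kfp import dsl, gcp, onprem
from kubernetes import client as k8s_client
from kubernetes.client.models import V1EnvVar

platform = 'onprem'          # assumed: 'GCP' or 'onprem', chosen per deployment
PYTHONPATH = '/facenet/src'  # assumed: facenet source path inside the training image
is_aligned = 'True'          # assumed: whether the dataset-alignment branch should run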