Example #1
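This and the following excerpts come from larger pipeline modules; a minimal sketch of the imports they typically assume (the exact list varies per example, and a few samples target the newer KFP v2 SDK instead):

import kfp
from kfp import dsl
from kfp import compiler, gcp, onprem
from kubernetes import client as k8s_client
from kubernetes.client.models import V1EnvVar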
def default_train(
    resource_group,
    workspace,
    dataset
):
    """Pipeline steps"""

    operations = {}
    callback_url = 'kubemlopsbot-svc.kubeflow.svc.cluster.local:8080'

    exit_op = dsl.ContainerOp(
        name='Exit Handler',
        image="curlimages/curl",
        command=['curl'],
        arguments=[
            '-d', get_callback_payload(TRAIN_FINISH_EVENT),
            callback_url
        ]
    )

    with dsl.ExitHandler(exit_op):
        start_callback = \
            dsl.UserContainer('callback',
                              'curlimages/curl',
                              command=['curl'],
                              args=['-d',
                                    get_callback_payload(TRAIN_START_EVENT), callback_url])  # noqa: E501

        operations['start'] = dsl.ContainerOp(
            name='start',
            init_containers=[start_callback],
            image="busybox",
            command=['sh', '-c'],
            arguments=[
                'echo "Pipeline starting"'
            ]
        )

        operations['end'] = dsl.ContainerOp(
            name='End',
            image="curlimages/curl",
            command=['curl'],
            arguments=[
                '-d', get_callback_payload("Model is registered"),
                callback_url
            ]
        )
        operations['end'].after(operations['start'])

    for _, op_1 in operations.items():
        op_1.container.set_image_pull_policy("Always")
        op_1.add_volume(
            k8s_client.V1Volume(
              name='azure',
              persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(  # noqa: E501
                claim_name='azure-managed-file')
            )
        ).add_volume_mount(k8s_client.V1VolumeMount(
            mount_path='/mnt/azure', name='azure'))
Example #2
def xgb_train_pipeline(
    output,
    project,
    region='us-central1',
    train_data='gs://ml-pipeline-playground/sfpd/train.csv',
    eval_data='gs://ml-pipeline-playground/sfpd/eval.csv',
    schema='gs://ml-pipeline-playground/sfpd/schema.json',
    target='resolution',
    rounds=200,
    workers=2,
    true_label='ACTION',
):
  delete_cluster_op = DeleteClusterOp('delete-cluster', project, region).apply(gcp.use_gcp_secret('user-gcp-sa'))
  with dsl.ExitHandler(exit_op=delete_cluster_op):
    create_cluster_op = CreateClusterOp('create-cluster', project, region, output).apply(gcp.use_gcp_secret('user-gcp-sa'))

    analyze_op = AnalyzeOp('analyze', project, region, create_cluster_op.output, schema,
                           train_data, '%s/{{workflow.name}}/analysis' % output).apply(gcp.use_gcp_secret('user-gcp-sa'))

    transform_op = TransformOp('transform', project, region, create_cluster_op.output,
                               train_data, eval_data, target, analyze_op.output,
                               '%s/{{workflow.name}}/transform' % output).apply(gcp.use_gcp_secret('user-gcp-sa'))

    train_op = TrainerOp('train', project, region, create_cluster_op.output, transform_op.outputs['train'],
                         transform_op.outputs['eval'], target, analyze_op.output, workers,
                         rounds, '%s/{{workflow.name}}/model' % output).apply(gcp.use_gcp_secret('user-gcp-sa'))

    predict_op = PredictOp('predict', project, region, create_cluster_op.output, transform_op.outputs['eval'],
                           train_op.output, target, analyze_op.output, '%s/{{workflow.name}}/predict' % output).apply(gcp.use_gcp_secret('user-gcp-sa'))

    confusion_matrix_op = ConfusionMatrixOp('confusion-matrix', predict_op.output,
                                            '%s/{{workflow.name}}/confusionmatrix' % output).apply(gcp.use_gcp_secret('user-gcp-sa'))

    roc_op = RocOp('roc', predict_op.output, true_label, '%s/{{workflow.name}}/roc' % output).apply(gcp.use_gcp_secret('user-gcp-sa'))
Example #3
def pipeline_exit_handler(message: str = 'Hello World!'):

    exit_task = print_op(message='Exit handler has worked!')

    with dsl.ExitHandler(exit_task):
        print_op(message=message)
        fail_op(message='Task failed.')
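The example above targets the KFP v2 SDK; a hedged sketch of how the print_op and fail_op lightweight components it references could be defined and the pipeline compiled (the component bodies are assumptions, not the original source):

from kfp import compiler, dsl

@dsl.component
def print_op(message: str):
    print(message)

@dsl.component
def fail_op(message: str):
    # fail on purpose so the exit handler can be observed
    import sys
    sys.exit(message)

# assumes pipeline_exit_handler is decorated with @dsl.pipeline
compiler.Compiler().compile(pipeline_exit_handler, 'exit_handler_pipeline.yaml')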
Example #4
def mnist_pipeline(learning_rate, dropout_rate, checkpoint_dir,
                   saved_model_dir, tensorboard_log):
    exit_task = echo_op("Done!")
    with dsl.ExitHandler(exit_task):
        vop = dsl.VolumeOp(name="mnist_model_volume",
                           resource_name="mnist_model",
                           storage_class="nfs-client",
                           modes=dsl.VOLUME_MODE_RWM,
                           size="10Gi")

        mnist = dsl.ContainerOp(
            name='Mnist',
            image=
            'kubeflow-registry.default.svc.cluster.local:30000/katib-job:2B27615F',
            command=['python', '/app/mnist_to_pipeline.py'],
            arguments=[
                "--learning_rate", learning_rate, "--dropout_rate",
                dropout_rate, "--checkpoint_dir", checkpoint_dir,
                "--saved_model_dir", saved_model_dir, "--tensorboard_log",
                tensorboard_log
            ],
            pvolumes={"/result": vop.volume})

        result = dsl.ContainerOp(name='list_list',
                                 image='library/bash:4.4.23',
                                 command=['ls', '-R', '/result'],
                                 pvolumes={"/result": mnist.pvolume})

        mnist.after(vop)
        result.after(mnist)
Example #5
    def _create_pipeline_exit_handler(self):
        enable_volume_cleaning = (
            self.run_config.volume is not None
            and not self.run_config.volume.keep
        )

        if not enable_volume_cleaning:
            return contextlib.nullcontext()

        return dsl.ExitHandler(
            dsl.ContainerOp(
                name="schedule-volume-termination",
                image="gcr.io/cloud-builders/kubectl",
                command=[
                    "kubectl",
                    "delete",
                    "pvc",
                    "{{workflow.name}}-data-volume",
                    "--wait=false",
                    "--ignore-not-found",
                    "--output",
                    "name",
                ],
            )
        )
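Since the helper returns either a dsl.ExitHandler or a contextlib.nullcontext(), callers can wrap the pipeline body unconditionally; a hedged usage sketch (the surrounding method and _build_kfp_ops are assumptions, not part of the original):

    def _build_kfp_pipeline(self):
        # the with-block works whether or not volume cleaning is enabled
        with self._create_pipeline_exit_handler():
            self._build_kfp_ops()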
Example #6
def xgb_train_pipeline(
    output,
    project,
    region=dsl.PipelineParam('region', value='us-central1'),
    train_data=dsl.PipelineParam('train-data', value='gs://ml-pipeline-playground/sfpd/train.csv'),
    eval_data=dsl.PipelineParam('eval-data', value='gs://ml-pipeline-playground/sfpd/eval.csv'),
    schema=dsl.PipelineParam('schema', value='gs://ml-pipeline-playground/sfpd/schema.json'),
    target=dsl.PipelineParam('target', value='resolution'),
    rounds=dsl.PipelineParam('rounds', value=200),
    workers=dsl.PipelineParam('workers', value=2),
    true_label=dsl.PipelineParam('true-label', value='ACTION'),
):
  delete_cluster_op = DeleteClusterOp('delete-cluster', project, region)
  with dsl.ExitHandler(exit_op=delete_cluster_op):
    create_cluster_op = CreateClusterOp('create-cluster', project, region, output)

    analyze_op = AnalyzeOp('analyze', project, region, create_cluster_op.output, schema,
                           train_data, '%s/{{workflow.name}}/analysis' % output)

    transform_op = TransformOp('transform', project, region, create_cluster_op.output,
                               train_data, eval_data, target, analyze_op.output,
                               '%s/{{workflow.name}}/transform' % output)

    train_op = TrainerOp('train', project, region, create_cluster_op.output, transform_op.outputs['train'],
                         transform_op.outputs['eval'], target, analyze_op.output, workers,
                         rounds, '%s/{{workflow.name}}/model' % output)

    predict_op = PredictOp('predict', project, region, create_cluster_op.output, transform_op.outputs['eval'],
                           train_op.output, target, analyze_op.output, '%s/{{workflow.name}}/predict' % output)

    confusion_matrix_op = ConfusionMatrixOp('confusion-matrix', predict_op.output,
                                            '%s/{{workflow.name}}/confusionmatrix' % output)

    roc_op = RocOp('roc', predict_op.output, true_label, '%s/{{workflow.name}}/roc' % output)
Example #7
def pipeline_exit_handler(url='gs://ml-pipeline/shakespeare1.txt'):
    """A sample pipeline showing exit handler."""

    exit_task = echo_msg('exit!')

    with dsl.ExitHandler(exit_task):
        download_task = gcs_download_op(url)
        echo_task = print_file(download_task.output)
Example #8
def download_and_print(url='gs://ml-pipeline-playground/shakespeare1.txt'):
    """A sample pipeline showing exit handler."""

    exit_task = echo_op('exit!')

    with dsl.ExitHandler(exit_task):
        download_task = gcs_download_op(url)
        echo_task = echo_op(download_task.output)
Example #9
def kfpipeline():

    exit_task = NewTask(handler='run_summary_comment')
    exit_task.with_params(workflow_id='{{workflow.uid}}',
                          repo=this_project.params.get('git_repo'),
                          issue=this_project.params.get('git_issue'))
    exit_task.with_secrets(
        'inline', {'GITHUB_TOKEN': this_project.get_secret('GITHUB_TOKEN')})
    with dsl.ExitHandler(funcs['git_utils'].as_step(exit_task,
                                                    name='exit-handler')):

        # run the ingestion function with the new image and params
        ingest = funcs['gen-iris'].as_step(name="get-data",
                                           handler='iris_generator',
                                           params={'format': 'pq'},
                                           outputs=[DATASET])

        # train with hyper-parameters
        train = funcs["train"].as_step(
            name="train",
            params={
                "sample": -1,
                "label_column": LABELS,
                "test_size": 0.10
            },
            hyperparams={
                'model_pkg_class': [
                    "sklearn.ensemble.RandomForestClassifier",
                    "sklearn.linear_model.LogisticRegression",
                    "sklearn.ensemble.AdaBoostClassifier"
                ]
            },
            selector='max.accuracy',
            inputs={"dataset": ingest.outputs[DATASET]},
            labels={"commit": this_project.params.get('commit', '')},
            outputs=['model', 'test_set'])

        # test and visualize our model
        test = funcs["test"].as_step(name="test",
                                     params={"label_column": LABELS},
                                     inputs={
                                         "models_path": train.outputs['model'],
                                         "test_set": train.outputs['test_set']
                                     })

        # deploy our model as a serverless function
        deploy = funcs["serving"].deploy_step(
            models={f"{DATASET}_v1": train.outputs['model']},
            tag=this_project.params.get('commit', 'v1')[:6])

        # test out new model server (via REST API calls)
        tester = funcs["live_tester"].as_step(
            name='model-tester',
            params={
                'addr': deploy.outputs['endpoint'],
                'model': f"{DATASET}_v1"
            },
            inputs={'table': train.outputs['test_set']})
Example #10
def mnist_pipeline(learning_rate, dropout_rate, checkpoint_dir,
                   saved_model_dir, tensorboard_log):
    exit_task = echo_op("Done!")
    with dsl.ExitHandler(exit_task):
        vop = dsl.VolumeOp(name="mnist_model_volume",
                           resource_name="mnist_model",
                           storage_class="nfs-client",
                           modes=dsl.VOLUME_MODE_RWM,
                           size="10Gi")

        mnist = dsl.ContainerOp(
            name='Mnist',
            image=
            'kubeflow-registry.default.svc.cluster.local:30000/katib-job:2B27615F',
            command=['python', '/app/mnist_to_pipeline.py'],
            arguments=[
                "--learning_rate", learning_rate, "--dropout_rate",
                dropout_rate, "--checkpoint_dir", checkpoint_dir,
                "--saved_model_dir", saved_model_dir, "--tensorboard_log",
                tensorboard_log
            ],
            pvolumes={"/result": vop.volume},
            output_artifact_paths={
                'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'
            },
            container_kwargs={
                'env': [
                    V1EnvVar('S3_ENDPOINT',
                             'minio-service.kubeflow.svc.cluster.local:9000'),
                    V1EnvVar(
                        'AWS_ENDPOINT_URL',
                        'http://minio-service.kubeflow.svc.cluster.local:9000'
                    ),
                    V1EnvVar('AWS_ACCESS_KEY_ID', 'minio'),
                    V1EnvVar('AWS_SECRET_ACCESS_KEY', 'minio123'),
                    V1EnvVar('AWS_REGION', 'us-east-1'),
                    V1EnvVar('S3_USE_HTTPS', '0'),
                    V1EnvVar('S3_VERIFY_SSL', '0'),
                ]
            })

        result = dsl.ContainerOp(name='list_list',
                                 image='library/bash:4.4.23',
                                 command=['ls', '-R', '/result'],
                                 pvolumes={"/result": mnist.pvolume})

        mnist.after(vop)
        result.after(mnist)

        arguments = {
            'learning_rate': '0.01',
            'dropout_rate': '0.2',
            'checkpoint_dir': '/result/training_checkpoints',
            'model_version': '001',
            'saved_model_dir': '/result/saved_model',
            'tensorboard_log': '/result/log'
        }
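The trailing arguments dict is evidently meant for submitting a run rather than for the pipeline body itself; a hedged sketch of the usual v1 submission call (the host URL is a placeholder):

import kfp

client = kfp.Client(host='http://localhost:8080')  # placeholder endpoint
client.create_run_from_pipeline_func(mnist_pipeline, arguments=arguments)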
Example #11
def xgb_train_pipeline(
    output,
    project,
    region='us-central1',
    train_data='gs://ml-pipeline-playground/sfpd/train.csv',
    eval_data='gs://ml-pipeline-playground/sfpd/eval.csv',
    schema='gs://ml-pipeline-playground/sfpd/schema.json',
    target='resolution',
    rounds=200,
    workers=2,
    true_label='ACTION',
):
    output_template = str(
        output) + '/' + dsl.EXECUTION_ID_PLACEHOLDER + '/data'

    delete_cluster_op = dataproc_delete_cluster_op(project, region).apply(
        gcp.use_gcp_secret('user-gcp-sa'))

    with dsl.ExitHandler(exit_op=delete_cluster_op):
        create_cluster_op = dataproc_create_cluster_op(
            project, region, output).apply(gcp.use_gcp_secret('user-gcp-sa'))

        analyze_op = dataproc_analyze_op(project, region,
                                         create_cluster_op.output, schema,
                                         train_data, output_template).apply(
                                             gcp.use_gcp_secret('user-gcp-sa'))

        transform_op = dataproc_transform_op(
            project, region, create_cluster_op.output, train_data, eval_data,
            target, analyze_op.output,
            output_template).apply(gcp.use_gcp_secret('user-gcp-sa'))

        train_op = dataproc_train_op(project, region, create_cluster_op.output,
                                     transform_op.outputs['train'],
                                     transform_op.outputs['eval'], target,
                                     analyze_op.output, workers, rounds,
                                     output_template).apply(
                                         gcp.use_gcp_secret('user-gcp-sa'))

        predict_op = dataproc_predict_op(
            project, region, create_cluster_op.output,
            transform_op.outputs['eval'], train_op.output, target,
            analyze_op.output,
            output_template).apply(gcp.use_gcp_secret('user-gcp-sa'))

        confusion_matrix_task = confusion_matrix_op(
            predict_op.output,
            output_template).apply(gcp.use_gcp_secret('user-gcp-sa'))

        roc_task = roc_op(predictions_dir=predict_op.output,
                          true_class=true_label,
                          true_score_column=true_label,
                          output_dir=output_template).apply(
                              gcp.use_gcp_secret('user-gcp-sa'))
Example #12
def save_most_frequent_word():
    exit_task = exit_op()
    with dsl.ExitHandler(exit_task):
        counter = frequent_word_op(message=message_param)
        counter.container.set_memory_request('200M')

        saver = save_message_op(message=counter.outputs['word'],
                                output_path=output_path_param)
        saver.container.set_cpu_limit('0.5')
        # saver.container.set_gpu_limit('2')
        saver.add_node_selector_constraint('kubernetes.io/os', 'linux')
Example #13
def save_most_frequent_word():
    exit_op = ExitHandlerOp('exiting')
    with dsl.ExitHandler(exit_op):
        counter = GetFrequentWordOp(name='get-Frequent', message=message_param)
        counter.container.set_memory_request('200M')

        saver = SaveMessageOp(name='save',
                              message=counter.output,
                              output_path=output_path_param)
        saver.container.set_cpu_limit('0.5')
        # saver.container.set_gpu_limit('2')
        saver.add_node_selector_constraint('kubernetes.io/os', 'linux')
Example #14
def save_most_frequent_word(message: dsl.PipelineParam,
                            outputpath: dsl.PipelineParam):
    """A pipeline function describing the orchestration of the workflow."""

    exit_op = ExitHandlerOp('exiting')
    with dsl.ExitHandler(exit_op):
        counter = GetFrequentWordOp(name='get-Frequent', message=message)
        counter.set_memory_request('200M')

        saver = SaveMessageOp(name='save',
                              message=counter.output,
                              output_path=outputpath)
        saver.set_cpu_limit('0.5')
Example #15
def save_most_frequent_word():
    exit_op = ExitHandlerOp('exiting')
    with dsl.ExitHandler(exit_op):
        counter = GetFrequentWordOp(name='get-Frequent', message=message_param)
        counter.container.set_memory_request('200M')

        saver = SaveMessageOp(name='save',
                              message=counter.output,
                              output_path=output_path_param)
        saver.container.set_cpu_limit('0.5')
        saver.container.set_gpu_limit('2')
        saver.add_node_selector_constraint('cloud.google.com/gke-accelerator',
                                           'nvidia-tesla-k80')
        saver.apply(
            gcp.use_tpu(tpu_cores=8, tpu_resource='v2', tf_version='1.12'))
Example #16
def save_most_frequent_word(message: str, outputpath: str):
    """A pipeline function describing the orchestration of the workflow."""

    exit_op = ExitHandlerOp('exiting')
    with dsl.ExitHandler(exit_op):
        counter = GetFrequentWordOp(name='get-Frequent', message=message)
        counter.set_memory_request('200M')

        saver = SaveMessageOp(name='save',
                              message=counter.output,
                              output_path=outputpath)
        saver.set_cpu_limit('0.5')
        saver.set_gpu_limit('2')
        saver.add_node_selector_constraint('cloud.google.com/gke-accelerator',
                                           'nvidia-tesla-k80')
Example #17
def mnist_pipeline(volume_size, learning_rate, dropout_rate, checkpoint_dir,
                   saved_model_dir, tensorboard_log, namespace, storage_uri,
                   name):
    exit_task = echo_op("Done!")
    with dsl.ExitHandler(exit_task):
        vop = dsl.VolumeOp(name='mnist_model',
                           resource_name='mnist_model',
                           storage_class="nfs-client",
                           modes=dsl.VOLUME_MODE_RWM,
                           size=volume_size)

        mnist = dsl.ContainerOp(
            name='Mnist',
            image=
            'kubeflow-registry.default.svc.cluster.local:30000/katib-job:FF61F3B',
            command=['python', '/app/Untitled.py'],
            arguments=[
                "--learning_rate", learning_rate, "--dropout_rate",
                dropout_rate, "--checkpoint_dir", checkpoint_dir,
                "--saved_model_dir", saved_model_dir, "--tensorboard_log",
                tensorboard_log
            ],
            pvolumes={"/result": vop.volume})

        result = dsl.ContainerOp(name='list_list',
                                 image='library/bash:4.4.23',
                                 command=['ls', '-R', '/result'],
                                 pvolumes={"/result": mnist.pvolume})
        kfserving = dsl.ContainerOp(
            name='kfserving',
            image=
            'kubeflow-registry.default.svc.cluster.local:30000/kfserving:6D7B836C',
            command=['python', '/app/kfserving-fairing.py'],
            arguments=[
                "--namespace", namespace, "--storage_uri",
                "pvc://" + str(vop.volume.persistent_volume_claim.claim_name) +
                str(storage_uri), "--name", name
            ],
            pvolumes={"/result": mnist.pvolume})
        mnist_web_ui = dsl.ContainerOp(
            name='mnist_web_ui',
            image='brightfly/kfserving-mnist-web-ui-deploy:latest',
        )

        mnist.after(vop)
        result.after(mnist)
        kfserving.after(mnist)
        mnist_web_ui.after(kfserving)
Example #18
def download_and_print(url='gs://ml-pipeline-playground/shakespeare1.txt'):
    """A sample pipeline showing exit handler."""

    exit_op = dsl.ContainerOp(name='finally',
                              image='library/bash:4.4.23',
                              command=['echo', 'exit!'])

    with dsl.ExitHandler(exit_op):

        op1 = dsl.ContainerOp(
            name='download',
            image='google/cloud-sdk:216.0.0',
            command=['sh', '-c'],
            arguments=['gsutil cat %s | tee /tmp/results.txt' % url],
            file_outputs={'downloaded': '/tmp/results.txt'})

        op2 = dsl.ContainerOp(name='echo',
                              image='library/bash:4.4.23',
                              command=['sh', '-c'],
                              arguments=['echo %s' % op1.output])
Example #19
def email_pipeline(
    server_secret="server-secret",
    subject="Hi, again!",
    body="Tekton email",
    sender="*****@*****.**",
    recipients="[email protected], [email protected]",
    attachment_filepath="/tmp/data/output.txt"
):
    email = email_op(server_secret=server_secret,
                     subject=subject,
                     body=body,
                     sender=sender,
                     recipients=recipients,
                     attachment_path=attachment_filepath)
    email.add_env_variable(env_from_secret('USER', '$(params.server_secret)', 'user'))
    email.add_env_variable(env_from_secret('PASSWORD', '$(params.server_secret)', 'password'))
    email.add_env_variable(env_from_secret('TLS', '$(params.server_secret)', 'tls'))
    email.add_env_variable(env_from_secret('SERVER', '$(params.server_secret)', 'url'))
    email.add_env_variable(env_from_secret('PORT', '$(params.server_secret)', 'port'))
    email.apply(onprem.mount_pvc('shared-pvc', 'shared-pvc', attachment_path))

    with dsl.ExitHandler(email):
        write_file_task = write_file(attachment_filepath).apply(onprem.mount_pvc('shared-pvc', 'shared-pvc', attachment_path))
Example #20
def flipcoin_exit_pipeline():
    exit_task = print_op('Exit handler has worked!')
    with dsl.ExitHandler(exit_task):
        flip = flip_coin_op()
        with dsl.Condition(flip.output == 'heads'):
            random_num_head = get_random_int_op(0, 9)
            with dsl.Condition(random_num_head.output > 5):
                print_op('heads and %s > 5!' % random_num_head.output)
            with dsl.Condition(random_num_head.output <= 5):
                print_op('heads and %s <= 5!' % random_num_head.output)

        with dsl.Condition(flip.output == 'tails'):
            random_num_tail = get_random_int_op(10, 19)
            with dsl.Condition(random_num_tail.output > 15):
                print_op('tails and %s > 15!' % random_num_tail.output)
            with dsl.Condition(random_num_tail.output <= 15):
                print_op('tails and %s <= 15!' % random_num_tail.output)

        with dsl.Condition(flip.output == 'tails'):
            fail_op(
                message=
                "Failing the run to demonstrate that exit handler still gets executed."
            )
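This sample relies on KFP v2 lightweight components; a hedged sketch of what flip_coin_op and get_random_int_op might look like (the bodies are assumptions):

from kfp import dsl

@dsl.component
def flip_coin_op() -> str:
    import random
    return 'heads' if random.randint(0, 1) == 0 else 'tails'

@dsl.component
def get_random_int_op(minimum: int, maximum: int) -> int:
    import random
    return random.randint(minimum, maximum)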
Example #21
def my_pipeline(message: str = 'Hello World!'):
    exit_task = exit_op(user_input=message)

    with dsl.ExitHandler(exit_task, name='my-pipeline'):
        print_op(message=message)
Example #22
def xgb_train_pipeline(
    output='gs://{{kfp-default-bucket}}',
    project='{{kfp-project-id}}',
    diagnostic_mode='HALT_ON_ERROR',
    rounds=5,
):
    output_template = str(output) + '/' + dsl.RUN_ID_PLACEHOLDER + '/data'
    region='us-central1'
    workers=2
    quota_check=[{'region':region,'metric':'CPUS','quota_needed':12.0}]
    train_data='gs://ml-pipeline/sample-data/sfpd/train.csv'
    eval_data='gs://ml-pipeline/sample-data/sfpd/eval.csv'
    schema='gs://ml-pipeline/sample-data/sfpd/schema.json'
    true_label='ACTION'
    target='resolution'
    required_apis='dataproc.googleapis.com'
    cluster_name='xgb-%s' % dsl.RUN_ID_PLACEHOLDER

    # Current GCP pyspark/spark ops do not provide outputs as return values; instead,
    # we need to use strings to pass the URIs around.
    analyze_output = output_template
    transform_output_train = os.path.join(output_template, 'train', 'part-*')
    transform_output_eval = os.path.join(output_template, 'eval', 'part-*')
    train_output = os.path.join(output_template, 'train_output')
    predict_output = os.path.join(output_template, 'predict_output')
    
    _diagnose_me_op = diagnose_me_op(
        bucket=output,
        execution_mode=diagnostic_mode,
        project_id=project, 
        target_apis=required_apis,
        quota_check=quota_check)
    
    with dsl.ExitHandler(exit_op=dataproc_delete_cluster_op(
        project_id=project,
        region=region,
        name=cluster_name
    )):
        _create_cluster_op = dataproc_create_cluster_op(
            project_id=project,
            region=region,
            name=cluster_name,
            initialization_actions=[
              os.path.join(_PYSRC_PREFIX,
                           'initialization_actions.sh'),
            ],
            image_version='1.5'
        ).after(_diagnose_me_op)

        _analyze_op = dataproc_analyze_op(
            project=project,
            region=region,
            cluster_name=cluster_name,
            schema=schema,
            train_data=train_data,
            output=output_template
        ).after(_create_cluster_op).set_display_name('Analyzer')

        _transform_op = dataproc_transform_op(
            project=project,
            region=region,
            cluster_name=cluster_name,
            train_data=train_data,
            eval_data=eval_data,
            target=target,
            analysis=analyze_output,
            output=output_template
        ).after(_analyze_op).set_display_name('Transformer')

        _train_op = dataproc_train_op(
            project=project,
            region=region,
            cluster_name=cluster_name,
            train_data=transform_output_train,
            eval_data=transform_output_eval,
            target=target,
            analysis=analyze_output,
            workers=workers,
            rounds=rounds,
            output=train_output
        ).after(_transform_op).set_display_name('Trainer')

        _predict_op = dataproc_predict_op(
            project=project,
            region=region,
            cluster_name=cluster_name,
            data=transform_output_eval,
            model=train_output,
            target=target,
            analysis=analyze_output,
            output=predict_output
        ).after(_train_op).set_display_name('Predictor')

        _cm_op = confusion_matrix_op(
            predictions=os.path.join(predict_output, 'part-*.csv'),
            output_dir=output_template
        ).after(_predict_op)

        _roc_op = roc_op(
            predictions_dir=os.path.join(predict_output, 'part-*.csv'),
            true_class=true_label,
            true_score_column=true_label,
            output_dir=output_template
        ).after(_predict_op)
Example #23
def xgb_train_pipeline(
    output='gs://your-gcs-bucket',
    project='your-gcp-project',
    cluster_name='xgb-%s' % dsl.RUN_ID_PLACEHOLDER,
    region='us-central1',
    train_data='gs://ml-pipeline-playground/sfpd/train.csv',
    eval_data='gs://ml-pipeline-playground/sfpd/eval.csv',
    schema='gs://ml-pipeline-playground/sfpd/schema.json',
    target='resolution',
    rounds=200,
    workers=2,
    true_label='ACTION',
):
    output_template = str(output) + '/' + dsl.RUN_ID_PLACEHOLDER + '/data'

    # Current GCP pyspark/spark ops do not provide outputs as return values; instead,
    # we need to use strings to pass the URIs around.
    analyze_output = output_template
    transform_output_train = os.path.join(output_template, 'train', 'part-*')
    transform_output_eval = os.path.join(output_template, 'eval', 'part-*')
    train_output = os.path.join(output_template, 'train_output')
    predict_output = os.path.join(output_template, 'predict_output')

    with dsl.ExitHandler(exit_op=dataproc_delete_cluster_op(
        project_id=project,
        region=region,
        name=cluster_name
    )):
        _create_cluster_op = dataproc_create_cluster_op(
            project_id=project,
            region=region,
            name=cluster_name,
            initialization_actions=[
              os.path.join(_PYSRC_PREFIX,
                           'initialization_actions.sh'),
            ],
            image_version='1.2'
        )

        _analyze_op = dataproc_analyze_op(
            project=project,
            region=region,
            cluster_name=cluster_name,
            schema=schema,
            train_data=train_data,
            output=output_template
        ).after(_create_cluster_op).set_display_name('Analyzer')

        _transform_op = dataproc_transform_op(
            project=project,
            region=region,
            cluster_name=cluster_name,
            train_data=train_data,
            eval_data=eval_data,
            target=target,
            analysis=analyze_output,
            output=output_template
        ).after(_analyze_op).set_display_name('Transformer')

        _train_op = dataproc_train_op(
            project=project,
            region=region,
            cluster_name=cluster_name,
            train_data=transform_output_train,
            eval_data=transform_output_eval,
            target=target,
            analysis=analyze_output,
            workers=workers,
            rounds=rounds,
            output=train_output
        ).after(_transform_op).set_display_name('Trainer')

        _predict_op = dataproc_predict_op(
            project=project,
            region=region,
            cluster_name=cluster_name,
            data=transform_output_eval,
            model=train_output,
            target=target,
            analysis=analyze_output,
            output=predict_output
        ).after(_train_op).set_display_name('Predictor')

        _cm_op = confusion_matrix_op(
            predictions=os.path.join(predict_output, 'part-*.csv'),
            output_dir=output_template
        ).after(_predict_op)

        _roc_op = roc_op(
            predictions_dir=os.path.join(predict_output, 'part-*.csv'),
            true_class=true_label,
            true_score_column=true_label,
            output_dir=output_template
        ).after(_predict_op)

    dsl.get_pipeline_conf().add_op_transformer(
        gcp.use_gcp_secret('user-gcp-sa'))
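add_op_transformer applies the given callable to every op in the pipeline at compile time, which is why a single line can attach the GCP secret everywhere; a hedged sketch of a custom transformer in the same style (set_always_pull is illustrative, not from the original):

def set_always_pull(op):
    # invoked once per ContainerOp when the pipeline is compiled
    op.container.set_image_pull_policy('Always')
    return op

dsl.get_pipeline_conf().add_op_transformer(set_always_pull)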
Example #24
def tacosandburritos_train(resource_group,
                           workspace,
                           dataset,
                           mlflow_experiment_id,
                           azdocallbackinfo=None):

    exit_handler_op = exit_op(
        kfp_host_url="$(KFP_HOST)",
        azdocallbackinfo=azdocallbackinfo,
        run_id=dsl.RUN_ID_PLACEHOLDER,
        tenant_id="$(AZ_TENANT_ID)",
        service_principal_id="$(AZ_CLIENT_ID)",
        service_principal_password="******",
        pat_env="PAT_ENV").apply(use_azure_secret()).apply(
            use_kfp_host_secret()).apply(use_image(exit_image_name)).apply(
                use_secret_var("azdopat", "PAT_ENV", "azdopat"))

    with dsl.ExitHandler(exit_op=exit_handler_op):

        operations['mlflowproject'] = mlflow_project_op(
            mlflow_experiment_id=mlflow_experiment_id,  # noqa: E501
            kf_run_id=dsl.RUN_ID_PLACEHOLDER).apply(
                use_databricks_secret()).apply(
                    use_image(mlflow_project_image_name))  # noqa: E501

        operations['preprocess'] = preprocess_op(
            base_path=persistent_volume_path,  # noqa: E501
            training_folder=training_folder,  # noqa: E501
            target=training_dataset,
            image_size=image_size,
            zipfile=dataset).apply(
                use_image(preprocess_image_name))  # noqa: E501

        operations['preprocess'].after(
            operations['mlflowproject'])  # noqa: E501

        operations['training'] = train_op(base_path=persistent_volume_path,
                                          training_folder=training_folder,
                                          epochs=2,
                                          batch=batch,
                                          image_size=image_size,
                                          lr=0.0001,
                                          model_folder=model_folder,
                                          images=training_dataset,
                                          dataset=operations['preprocess'].outputs['dataset']). \
            set_memory_request('16G'). \
            add_env_variable(V1EnvVar(name="RUN_ID", value=dsl.RUN_ID_PLACEHOLDER)). \
            add_env_variable(V1EnvVar(name="MLFLOW_TRACKING_URI", value=mlflow_url)). \
            add_env_variable(V1EnvVar(name="GIT_PYTHON_REFRESH", value='quiet')). \
            apply(use_image(train_image_name))

        # Spot nodepool target
        # operations['training'].add_toleration(k8s_client.V1Toleration(
        #     key='kubernetes.azure.com/scalesetpriority',
        #     operator='Equal',
        #     value='spot',
        #     effect="NoSchedule"))

        # Virtual/ACI nodepool target
        # operations['training'].add_node_selector_constraint(
        #     label_name='type', value='virtual-kubelet')
        # operations['training'].add_toleration(k8s_client.V1Toleration(
        #     key='virtual-kubelet.io/provider', operator='Exists'))

        operations['training'].after(operations['preprocess'])

        operations['evaluate'] = evaluate_op(
            model=operations['training'].outputs['model'])
        operations['evaluate'].after(operations['training'])

        operations['register to AML'] = register_op(base_path=persistent_volume_path,
                                                    model_file='latest.h5',
                                                    model_name=model_name,
                                                    tenant_id='$(AZ_TENANT_ID)',
                                                    service_principal_id='$(AZ_CLIENT_ID)',
                                                    service_principal_password='******',
                                                    subscription_id='$(AZ_SUBSCRIPTION_ID)',
                                                    resource_group=resource_group,
                                                    workspace=workspace,
                                                    run_id=dsl.RUN_ID_PLACEHOLDER). \
            apply(use_azure_secret()). \
            apply(use_image(register_images_name))

        operations['register to AML'].after(operations['evaluate'])

        operations['register to mlflow'] = register_mlflow_op(model='model',
                                                              model_name=model_name,
                                                              experiment_name='mexicanfood',
                                                              run_id=dsl.RUN_ID_PLACEHOLDER). \
            apply(use_azure_secret()). \
            add_env_variable(V1EnvVar(name="MLFLOW_TRACKING_URI", value=mlflow_url)). \
            apply(use_image(register_mlflow_image_name))

        operations['register to mlflow'].after(operations['register to AML'])

    for _, op_1 in operations.items():
        op_1.container.set_image_pull_policy("Always")
        op_1.add_volume(
            k8s_client.V1Volume(
                name='azure',
                persistent_volume_claim=k8s_client.
                V1PersistentVolumeClaimVolumeSource(  # noqa: E501
                    claim_name='azure-managed-file'))).add_volume_mount(
                        k8s_client.V1VolumeMount(mount_path='/mnt/azure',
                                                 name='azure'))
Example #25
def cnn_train(resource_group, workspace, dataset, token):
    """Pipeline steps"""

    persistent_volume_path = '/mnt/azure'
    data_download = dataset  # noqa: E501
    batch = 32
    model_name = 'cnnmodel'
    operations = {}
    image_size = 160
    training_folder = 'train'
    training_dataset = 'train.txt'
    model_folder = 'model'
    image_repo_name = "k8scc01covidmlopsacr.azurecr.io/mlops"
    callback_url = 'kubemlopsbot-svc.kubeflow.svc.cluster.local:8080'
    mlflow_url = 'http://mlflow.mlflow:5000'

    exit_op = dsl.ContainerOp(name='Exit Handler',
                              image="curlimages/curl",
                              command=['curl'],
                              arguments=[
                                  '-d',
                                  get_callback_payload(TRAIN_FINISH_EVENT),
                                  callback_url
                              ])

    with dsl.ExitHandler(exit_op):
        start_callback = \
            dsl.UserContainer('callback',
                              'curlimages/curl',
                              command=['curl'],
                              args=['-d',
                                    get_callback_payload(TRAIN_START_EVENT), callback_url])  # noqa: E501

        operations['tensorflow preprocess'] = dsl.ContainerOp(
            name='tensorflow preprocess',
            init_containers=[start_callback],
            image=image_repo_name + '/tensorflow-preprocess:latest',
            command=['python'],
            arguments=[
                '/scripts/data.py', '--base_path', persistent_volume_path,
                '--data', training_folder, '--target', training_dataset,
                '--img_size', image_size, '--zipfile', data_download
            ])

        operations['tensorflow training'] = dsl.ContainerOp(
            name="tensorflow training",
            image=image_repo_name + '/tensorflow-training:latest',
            command=['python'],
            arguments=[
                '/scripts/train.py', '--base_path', persistent_volume_path,
                '--data', training_folder, '--epochs', 2, '--batch', batch,
                '--image_size', image_size, '--lr', 0.0001, '--outputs',
                model_folder, '--dataset', training_dataset
            ],
            output_artifact_paths={
                'mlpipeline-metrics': '/mlpipeline-metrics.json',
                'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'
            }).apply(use_azstorage_secret()).add_env_variable(
                V1EnvVar(name="RUN_ID",
                         value=dsl.RUN_ID_PLACEHOLDER)).add_env_variable(
                             V1EnvVar(
                                 name="MLFLOW_TRACKING_TOKEN",
                                 value=token)).add_env_variable(
                                     V1EnvVar(
                                         name="MLFLOW_TRACKING_URI",
                                         value=mlflow_url)).add_env_variable(
                                             V1EnvVar(
                                                 name="GIT_PYTHON_REFRESH",
                                                 value='quiet'))  # noqa: E501

        operations['tensorflow training'].after(
            operations['tensorflow preprocess'])  # noqa: E501

        operations['evaluate'] = dsl.ContainerOp(
            name='evaluate',
            image="busybox",
            command=['sh', '-c'],
            arguments=['echo "Life is Good!"'])
        operations['evaluate'].after(operations['tensorflow training'])

        operations['register kubeflow'] = dsl.ContainerOp(
            name='register kubeflow',
            image=image_repo_name + '/register-kubeflow-artifacts:latest',
            command=['python'],
            arguments=[
                '/scripts/register.py', '--base_path', persistent_volume_path,
                '--model', 'latest.h5', '--model_name', model_name, '--data',
                training_folder, '--dataset', training_dataset, '--run_id',
                dsl.RUN_ID_PLACEHOLDER
            ]).apply(use_azure_secret())
        operations['register kubeflow'].after(operations['evaluate'])

        operations['register AML'] = dsl.ContainerOp(
            name='register AML',
            image=image_repo_name + '/register-aml:latest',
            command=['python'],
            arguments=[
                '/scripts/register.py', '--base_path', persistent_volume_path,
                '--model', 'latest.h5', '--model_name', model_name,
                '--tenant_id', "$(AZ_TENANT_ID)", '--service_principal_id',
                "$(AZ_CLIENT_ID)", '--service_principal_password',
                "$(AZ_CLIENT_SECRET)", '--subscription_id',
                "$(AZ_SUBSCRIPTION_ID)", '--resource_group', resource_group,
                '--workspace', workspace, '--run_id', dsl.RUN_ID_PLACEHOLDER
            ]).apply(use_azure_secret())
        operations['register AML'].after(operations['register kubeflow'])

        operations['register mlflow'] = dsl.ContainerOp(
            name='register mlflow',
            image=image_repo_name + '/register-mlflow:latest',
            command=['python'],
            arguments=[
                '/scripts/register.py', '--model', 'model', '--model_name',
                model_name, '--experiment_name', 'kubeflow-mlops', '--run_id',
                dsl.RUN_ID_PLACEHOLDER
            ]).apply(use_azure_secret()).add_env_variable(
                V1EnvVar(name="MLFLOW_TRACKING_URI",
                         value=mlflow_url)).add_env_variable(
                             V1EnvVar(name="MLFLOW_TRACKING_TOKEN",
                                      value=token))  # noqa: E501
        operations['register mlflow'].after(operations['register AML'])

        operations['finalize'] = dsl.ContainerOp(
            name='Finalize',
            image="curlimages/curl",
            command=['curl'],
            arguments=[
                '-d',
                get_callback_payload("Model is registered"), callback_url
            ])
        operations['finalize'].after(operations['register mlflow'])

    for _, op_1 in operations.items():
        op_1.container.set_image_pull_policy("Always")
        op_1.add_volume(
            k8s_client.V1Volume(
                name='azure',
                persistent_volume_claim=k8s_client.
                V1PersistentVolumeClaimVolumeSource(  # noqa: E501
                    claim_name='azure-managed-file'))).add_volume_mount(
                        k8s_client.V1VolumeMount(mount_path='/mnt/azure',
                                                 name='azure'))
Example #26
def ext_handler_pipeline():
    exit_op = print_op('Exit')

    with dsl.ExitHandler(exit_op):
        flip = flip_coin_op()
        print_op(flip.output)
Example #27
def tacosandburritos_train(resource_group, workspace, dataset):
    """Pipeline steps"""

    persistent_volume_path = '/mnt/azure'
    data_download = dataset  # noqa: E501
    batch = 32
    model_name = 'tacosandburritos'
    operations = {}
    image_size = 160
    training_folder = 'train'
    training_dataset = 'train.txt'
    model_folder = 'model'
    image_repo_name = "kubeflowyoacr.azurecr.io/mexicanfood"
    callback_url = 'kubemlopsbot-svc.kubeflow.svc.cluster.local:8080'
    mlflow_url = 'http://mlflow:5000'

    exit_op = dsl.ContainerOp(name='Exit Handler',
                              image="curlimages/curl",
                              command=['curl'],
                              arguments=[
                                  '-d',
                                  get_callback_payload(TRAIN_FINISH_EVENT),
                                  callback_url
                              ])

    with dsl.ExitHandler(exit_op):
        start_callback = \
            dsl.UserContainer('callback',
                              'curlimages/curl',
                              command=['curl'],
                              args=['-d',
                                    get_callback_payload(TRAIN_START_EVENT), callback_url])  # noqa: E501

        operations['data processing on databricks'] = dsl.ContainerOp(
            name='data processing on databricks',
            init_containers=[start_callback],
            image=image_repo_name + '/databricks-notebook:latest',
            arguments=[
                '-r', dsl.RUN_ID_PLACEHOLDER, '-p',
                '{"argument_one":"param one","argument_two":"param two"}'
            ]).apply(use_databricks_secret())

        operations['preprocess'] = dsl.ContainerOp(
            name='preprocess',
            image=image_repo_name + '/preprocess:latest',
            command=['python'],
            arguments=[
                '/scripts/data.py', '--base_path', persistent_volume_path,
                '--data', training_folder, '--target', training_dataset,
                '--img_size', image_size, '--zipfile', data_download
            ])

        operations['preprocess'].after(
            operations['data processing on databricks'])  # noqa: E501

        #  train
        #  TODO: read set of parameters from config file
        with dsl.ParallelFor([{
                'epochs': 1,
                'lr': 0.0001
        }, {
                'epochs': 2,
                'lr': 0.0002
        }, {
                'epochs': 3,
                'lr': 0.0003
        }]) as item:  # noqa: E501
            operations['training'] = dsl.ContainerOp(
                name="training",
                image=image_repo_name + '/training:latest',
                command=['python'],
                arguments=[
                    '/scripts/train.py', '--base_path', persistent_volume_path,
                    '--data', training_folder, '--epochs', item.epochs,
                    '--batch', batch, '--image_size', image_size, '--lr',
                    item.lr, '--outputs', model_folder, '--dataset',
                    training_dataset
                ],
                output_artifact_paths=
                {  # change output_artifact_paths to file_outputs after this PR is merged https://github.com/kubeflow/pipelines/pull/2334 # noqa: E501
                    'mlpipeline-metrics': '/mlpipeline-metrics.json',
                    'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'
                }).add_env_variable(
                    V1EnvVar(name="RUN_ID",
                             value=dsl.RUN_ID_PLACEHOLDER)).add_env_variable(
                                 V1EnvVar(name="MLFLOW_TRACKING_URI",
                                          value=mlflow_url)).add_env_variable(
                                              V1EnvVar(
                                                  name="GIT_PYTHON_REFRESH",
                                                  value='quiet'))  # noqa: E501

        operations['training'].after(operations['preprocess'])

        operations['evaluate'] = dsl.ContainerOp(
            name='evaluate',
            image="busybox",
            command=['sh', '-c'],
            arguments=['echo "Life is Good!"'])
        operations['evaluate'].after(operations['training'])

        # register model artifacts to kubeflow
        operations['register to kubeflow'] = dsl.ContainerOp(
            name='register to kubeflow',
            image=image_repo_name + '/registerartifacts:latest',
            command=['python'],
            arguments=[
                '/scripts/registerartifacts.py', '--base_path',
                persistent_volume_path, '--model', 'latest.h5', '--model_name',
                model_name, '--data', training_folder, '--dataset',
                training_dataset, '--run_id', dsl.RUN_ID_PLACEHOLDER
            ]).apply(use_azure_secret())
        operations['register to kubeflow'].after(operations['evaluate'])

        # register model
        operations['register to AML'] = dsl.ContainerOp(
            name='register to AML',
            image=image_repo_name + '/register:latest',
            command=['python'],
            arguments=[
                '/scripts/register.py', '--base_path', persistent_volume_path,
                '--model', 'latest.h5', '--model_name', model_name,
                '--tenant_id', "$(AZ_TENANT_ID)", '--service_principal_id',
                "$(AZ_CLIENT_ID)", '--service_principal_password',
                "$(AZ_CLIENT_SECRET)", '--subscription_id',
                "$(AZ_SUBSCRIPTION_ID)", '--resource_group', resource_group,
                '--workspace', workspace, '--run_id', dsl.RUN_ID_PLACEHOLDER
            ]).apply(use_azure_secret())
        operations['register to AML'].after(operations['register to kubeflow'])

        # register model to mlflow
        operations['register to mlflow'] = dsl.ContainerOp(
            name='register to mlflow',
            image=image_repo_name + '/register-mlflow:latest',
            command=['python'],
            arguments=[
                '/scripts/register.py', '--model', 'model', '--model_name',
                model_name, '--experiment_name', 'mexicanfood', '--run_id',
                dsl.RUN_ID_PLACEHOLDER
            ]).apply(use_azure_secret()).add_env_variable(
                V1EnvVar(name="MLFLOW_TRACKING_URI",
                         value=mlflow_url))  # noqa: E501
        operations['register to mlflow'].after(operations['register to AML'])

        operations['finalize'] = dsl.ContainerOp(
            name='Finalize',
            image="curlimages/curl",
            command=['curl'],
            arguments=[
                '-d',
                get_callback_payload("Model is registered"), callback_url
            ])
        operations['finalize'].after(operations['register to mlflow'])

    # operations['deploy'] = dsl.ContainerOp(
    #     name='deploy',
    #     image=image_repo_name + '/deploy:latest',
    #     command=['sh'],
    #     arguments=[
    #         '/scripts/deploy.sh',
    #         '-n', model_name,
    #         '-m', model_name,
    #         '-t', "$(AZ_TENANT_ID)",
    #         '-r', resource_group,
    #         '-w', workspace,
    #         '-s', "$(AZ_CLIENT_ID)",
    #         '-p', "$(AZ_CLIENT_SECRET)",
    #         '-u', "$(AZ_SUBSCRIPTION_ID)",
    #         '-b', persistent_volume_path,
    #         '-x', dsl.RUN_ID_PLACEHOLDER
    #     ]
    # ).apply(use_azure_secret())
    # operations['deploy'].after(operations['register'])

    for _, op_1 in operations.items():
        op_1.container.set_image_pull_policy("Always")
        op_1.add_volume(
            k8s_client.V1Volume(
                name='azure',
                persistent_volume_claim=k8s_client.
                V1PersistentVolumeClaimVolumeSource(  # noqa: E501
                    claim_name='azure-managed-file'))).add_volume_mount(
                        k8s_client.V1VolumeMount(mount_path='/mnt/azure',
                                                 name='azure'))
Example #28
def tacosandburritos_train(resource_group, workspace, dataset):

    exit_handler = exit_op(
        callback_url=callback_url,
        callback_payload=get_callback_payload(TRAIN_FINISH_EVENT))

    with dsl.ExitHandler(exit_handler):

        operations['data processing on databricks'] = databricks_op(run_id=dsl.RUN_ID_PLACEHOLDER,  # noqa: E501
                                                 notebook_params='{"argument_one":"param one","argument_two":"param two"}'  # noqa: E501
                                                 ).apply(use_databricks_secret()). \
                                                 add_init_container(get_start_callback_container()). \
                                                 apply(
                                                     use_image(databricks_image_name))

        operations['preprocess'] = preprocess_op(base_path=persistent_volume_path,  # noqa: E501
                                                 training_folder=training_folder,  # noqa: E501
                                                 target=training_dataset,
                                                 image_size=image_size,
                                                 zipfile=dataset). \
                                                 apply(
                                                     use_image(preprocess_image_name))

        operations['preprocess'].after(
            operations['data processing on databricks'])  # noqa: E501

        operations['training'] = train_op(base_path=persistent_volume_path,
                                          training_folder=training_folder,
                                          epochs=2,
                                          batch=batch,
                                          image_size=image_size,
                                          lr=0.0001,
                                          model_folder=model_folder,
                                          images=training_dataset,
                                          dataset=operations['preprocess'].outputs['dataset']). \
                                          set_memory_request('16G'). \
                                          add_env_variable(V1EnvVar(name="RUN_ID", value=dsl.RUN_ID_PLACEHOLDER)). \
                                          add_env_variable(V1EnvVar(name="MLFLOW_TRACKING_URI", value=mlflow_url)). \
                                          add_env_variable(V1EnvVar(name="GIT_PYTHON_REFRESH", value='quiet')).apply(use_image(train_image_name))  # noqa: E501, E127

        operations['training'].after(operations['preprocess'])

        operations['evaluate'] = evaluate_op(
            model=operations['training'].outputs['model'])
        operations['evaluate'].after(operations['training'])

        operations['register to AML'] = register_op(
            base_path=persistent_volume_path,
            model_file='latest.h5',
            model_name=model_name,
            tenant_id='$(AZ_TENANT_ID)',
            service_principal_id='$(AZ_CLIENT_ID)',
            service_principal_password='******',
            subscription_id='$(AZ_SUBSCRIPTION_ID)',
            resource_group=resource_group,
            workspace=workspace,
            run_id=dsl.RUN_ID_PLACEHOLDER).apply(use_azure_secret()).apply(
                use_image(register_images_name))  # noqa: E501, E127

        operations['register to AML'].after(operations['evaluate'])

        operations['register to mlflow'] = register_mlflow_op(model='model',
                                                              model_name=model_name,
                                                              experiment_name='mexicanfood',
                                                              run_id=dsl.RUN_ID_PLACEHOLDER).apply(use_azure_secret()). \
                                                              add_env_variable(V1EnvVar(name="MLFLOW_TRACKING_URI", value=mlflow_url)).apply(use_image(register_mlflow_image_name))  # noqa: E501

        operations['register to mlflow'].after(operations['register to AML'])

        operations['finalize'] = finalize_op(
            callback_url=callback_url,
            callback_payload=get_callback_payload("Model is registered"))
        operations['finalize'].after(operations['register to mlflow'])

    for _, op_1 in operations.items():
        op_1.container.set_image_pull_policy("Always")
        op_1.add_volume(
            k8s_client.V1Volume(
                name='azure',
                persistent_volume_claim=k8s_client.
                V1PersistentVolumeClaimVolumeSource(  # noqa: E501
                    claim_name='azure-managed-file'))).add_volume_mount(
                        k8s_client.V1VolumeMount(mount_path='/mnt/azure',
                                                 name='azure'))
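
The use_image(...) modifier applied to the steps above is a project-specific helper, not part of the KFP SDK. A minimal sketch of what such a helper typically looks like, assuming it does nothing more than swap the step's container image (the real implementation in the source repository may differ):

def use_image(image_name):
    """Return an op transformer that pins a step to a specific container image."""
    def _use_image(task):
        # task is the dsl.ContainerOp the transformer is applied to via .apply(...).
        task.container.image = image_name
        return task
    return _use_image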
Example #29
def tacosandburritos_train(
    resource_group,
    workspace
):
    """Pipeline steps"""

    persistent_volume_path = '/mnt/azure'
    data_download = 'https://aiadvocate.blob.core.windows.net/public/tacodata.zip'  # noqa: E501
    epochs = 2
    batch = 32
    learning_rate = 0.0001
    model_name = 'tacosandburritos'
    operations = {}
    image_size = 160
    training_folder = 'train'
    training_dataset = 'train.txt'
    model_folder = 'model'
    image_repo_name = "kubeflowyoacr.azurecr.io/mexicanfood"
    callback_url = 'kubemlopsbot-svc.kubeflow.svc.cluster.local:8080'

    exit_op = dsl.ContainerOp(
        name='Exit Handler',
        image="curlimages/curl",
        command=['curl'],
        arguments=[
            '-d', get_callback_payload(TRAIN_FINISH_EVENT),
            callback_url
        ]
    )

    with dsl.ExitHandler(exit_op):
        start_callback = \
            dsl.UserContainer('callback',
                              'curlimages/curl',
                              command=['curl'],
                              args=['-d',
                                    get_callback_payload(TRAIN_START_EVENT), callback_url])  # noqa: E501
        operations['preprocess'] = dsl.ContainerOp(
            name='preprocess',
            init_containers=[start_callback],
            image=image_repo_name + '/preprocess:latest',
            command=['python'],
            arguments=[
                '/scripts/data.py',
                '--base_path', persistent_volume_path,
                '--data', training_folder,
                '--target', training_dataset,
                '--img_size', image_size,
                '--zipfile', data_download
            ]
        )

        # train
        operations['training'] = dsl.ContainerOp(
            name='training',
            image=image_repo_name + '/training:latest',
            command=['python'],
            arguments=[
                '/scripts/train.py',
                '--base_path', persistent_volume_path,
                '--data', training_folder,
                '--epochs', epochs,
                '--batch', batch,
                '--image_size', image_size,
                '--lr', learning_rate,
                '--outputs', model_folder,
                '--dataset', training_dataset
            ]
        )
        operations['training'].after(operations['preprocess'])

        # register model
        operations['register'] = dsl.ContainerOp(
            name='register',
            image=image_repo_name + '/register:latest',
            command=['python'],
            arguments=[
                '/scripts/register.py',
                '--base_path', persistent_volume_path,
                '--model', 'latest.h5',
                '--model_name', model_name,
                '--tenant_id', "$(AZ_TENANT_ID)",
                '--service_principal_id', "$(AZ_CLIENT_ID)",
                '--service_principal_password', "$(AZ_CLIENT_SECRET)",
                '--subscription_id', "$(AZ_SUBSCRIPTION_ID)",
                '--resource_group', resource_group,
                '--workspace', workspace,
                '--run_id', dsl.RUN_ID_PLACEHOLDER
            ]
        ).apply(use_azure_secret())
        operations['register'].after(operations['training'])

        operations['finalize'] = dsl.ContainerOp(
            name='Finalize',
            image="curlimages/curl",
            command=['curl'],
            arguments=[
                '-d', get_callback_payload("Model is registered"),
                callback_url
            ]
        )
        operations['finalize'].after(operations['register'])

    # operations['deploy'] = dsl.ContainerOp(
    #     name='deploy',
    #     image=image_repo_name + '/deploy:latest',
    #     command=['sh'],
    #     arguments=[
    #         '/scripts/deploy.sh',
    #         '-n', model_name,
    #         '-m', model_name,
    #         '-t', "$(AZ_TENANT_ID)",
    #         '-r', resource_group,
    #         '-w', workspace,
    #         '-s', "$(AZ_CLIENT_ID)",
    #         '-p', "$(AZ_CLIENT_SECRET)",
    #         '-u', "$(AZ_SUBSCRIPTION_ID)",
    #         '-b', persistent_volume_path,
    #         '-x', dsl.RUN_ID_PLACEHOLDER
    #     ]
    # ).apply(use_azure_secret())
    # operations['deploy'].after(operations['register'])

    for _, op_1 in operations.items():
        op_1.container.set_image_pull_policy("Always")
        op_1.add_volume(
            k8s_client.V1Volume(
              name='azure',
              persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(  # noqa: E501
                claim_name='azure-managed-disk')
            )
        ).add_volume_mount(k8s_client.V1VolumeMount(
            mount_path='/mnt/azure', name='azure'))
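
The start, finish and "Model is registered" callbacks in this pipeline all POST a small payload to the kubemlopsbot service with curl. get_callback_payload and the event constants are not shown in the snippet; a minimal sketch, assuming the payload is just the event name plus the Kubeflow run ID serialized as JSON (the constant values and field names here are illustrative, not taken from the source):

import json

from kfp import dsl

TRAIN_START_EVENT = 'Training Started'
TRAIN_FINISH_EVENT = 'Training Finished'


def get_callback_payload(event_type):
    # Illustrative payload; the real helper may attach more metadata
    # (e.g. a commit SHA or the workflow status).
    payload = {
        'event_type': event_type,
        'run_id': dsl.RUN_ID_PLACEHOLDER,  # resolved by Argo at run time
    }
    return json.dumps(payload)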
Example #30
def mnist_pipeline(volume_size, learning_rate, dropout_rate, checkpoint_dir,
                   model_version, saved_model_dir, tensorboard_log, namespace,
                   storage_uri, name):
    exit_task = echo_op("Done!")
    with dsl.ExitHandler(exit_task):
        vop = dsl.VolumeOp(name='mnist_model',
                           resource_name='mnist_model',
                           storage_class="nfs-client",
                           modes=dsl.VOLUME_MODE_RWM,
                           size=volume_size)

        mnist = dsl.ContainerOp(
            name='Mnist',
            image=
            'kubeflow-registry.default.svc.cluster.local:30000/katib-job:8EA9F526',
            command=['python', '/app/save_model_mnist.py'],
            arguments=[
                "--learning_rate", learning_rate, "--dropout_rate",
                dropout_rate, "--checkpoint_dir", checkpoint_dir,
                "--model_version", model_version, "--saved_model_dir",
                saved_model_dir, "--tensorboard_log", tensorboard_log
            ],
            pvolumes={"/result": vop.volume},
            output_artifact_paths={
                'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'
            },
            container_kwargs={
                'env': [
                    V1EnvVar('S3_ENDPOINT',
                             'minio-service.kubeflow.svc.cluster.local:9000'),
                    V1EnvVar(
                        'AWS_ENDPOINT_URL',
                        'http://minio-service.kubeflow.svc.cluster.local:9000'
                    ),
                    V1EnvVar('AWS_ACCESS_KEY_ID', 'minio'),
                    V1EnvVar('AWS_SECRET_ACCESS_KEY', 'minio123'),
                    V1EnvVar('AWS_REGION', 'us-east-1'),
                    V1EnvVar('S3_USE_HTTPS', '0'),
                    V1EnvVar('S3_VERIFY_SSL', '0'),
                ]
            })

        result = dsl.ContainerOp(name='list_list',
                                 image='library/bash:4.4.23',
                                 command=['ls', '-R', '/result'],
                                 pvolumes={"/result": mnist.pvolume})
        '''
        kfserving = dsl.ContainerOp(
            name='kfserving',
            image='kubeflow-registry.default.svc.cluster.local:30000/kfserving:D0BE75E',
            command=['python', '/app/kfserving_fairing.py'],
            arguments=[
                "--namespace", namespace,
                "--storage_uri", "pvc://" + str(vop.volume.persistent_volume_claim.claim_name) + str(storage_uri),
                "--name", name
            ],
            pvolumes={"/result": mnist.pvolume}
        )
        '''
        kfserving = kfserving_op(
            action='update',
            model_name=name,
            default_model_uri="pvc://" +
            str(vop.volume.persistent_volume_claim.claim_name) +
            str(storage_uri),
            canary_model_uri='',
            canary_model_traffic_percentage='0',
            namespace='kubeflow',
            framework='tensorflow',
            default_custom_model_spec='{}',
            canary_custom_model_spec='{}',
            autoscaling_target='0',
            kfserving_endpoint='')

        mnist_web_ui = dsl.ContainerOp(
            name='mnist_web_ui',
            image='brightfly/kfserving-mnist-web-ui-deploy:latest',
        )

        mnist.after(vop)
        result.after(mnist)
        kfserving.after(mnist)
        mnist_web_ui.after(kfserving)
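
As in the other examples, the function above is only the pipeline body: echo_op and kfserving_op are defined elsewhere (the latter is presumably the reusable KFServing component loaded through kfp.components), and note that the pipeline's namespace parameter is not forwarded to kfserving_op, which hardcodes 'kubeflow'. To turn the function into something Kubeflow can run it still needs the @dsl.pipeline decorator and a compile step. A minimal sketch, assuming the kfp v1 SDK; the echo_op helper and the output file name are illustrative, not taken from the source:

import kfp
from kfp import dsl


def echo_op(text):
    # Typical helper used as an exit handler in Kubeflow samples:
    # a bash step that just echoes its argument.
    return dsl.ContainerOp(
        name='echo',
        image='library/bash:4.4.23',
        command=['sh', '-c'],
        arguments=['echo "$0"', text])


if __name__ == '__main__':
    # Compile the decorated pipeline function into an Argo workflow spec
    # that can be uploaded through the Kubeflow Pipelines UI or client.
    kfp.compiler.Compiler().compile(mnist_pipeline, 'mnist_pipeline.yaml')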