Example #1
from kfp.dsl import ContainerOp
from kfp.azure import use_azure_secret


# Method of a unit-test class; the framework supplies `self`.
def test_use_azure_secret(self):
    op1 = ContainerOp(name='op1', image='image')
    op1 = op1.apply(use_azure_secret('foo'))
    assert len(op1.container.env) == 4

    # use_azure_secret appends the four env vars in this order, each sourced
    # from the key of the same name in the 'foo' Kubernetes secret.
    expected_names = ['AZ_SUBSCRIPTION_ID', 'AZ_TENANT_ID',
                      'AZ_CLIENT_ID', 'AZ_CLIENT_SECRET']
    for index, expected in enumerate(expected_names):
        assert op1.container.env[index].name == expected
        assert op1.container.env[index].value_from.secret_key_ref.name == 'foo'
        assert op1.container.env[index].value_from.secret_key_ref.key == expected
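For context: `use_azure_secret` (from `kfp.azure` in the v1 SDK) is an op modifier applied via `ContainerOp.apply`. A minimal sketch of such a modifier, consistent with what the test above asserts (the authoritative implementation lives in the kfp source):

from kubernetes.client.models import (V1EnvVar, V1EnvVarSource,
                                      V1SecretKeySelector)


def use_azure_secret(secret_name='azcreds'):
    """Modifier mapping the four AZ_* keys of a k8s secret to env vars."""
    def _use_azure_secret(task):
        for key in ['AZ_SUBSCRIPTION_ID', 'AZ_TENANT_ID',
                    'AZ_CLIENT_ID', 'AZ_CLIENT_SECRET']:
            task.container.add_env_variable(
                V1EnvVar(name=key,
                         value_from=V1EnvVarSource(
                             secret_key_ref=V1SecretKeySelector(
                                 name=secret_name, key=key))))
        return task
    return _use_azure_secret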
Example #2
def model_deploy(resource_group, workspace):
    # deploy_operation, use_image and deploy_image_name are defined elsewhere
    # in the source module; the service principal password is redacted in
    # this listing. The $(AZ_*) placeholders resolve at run time to the env
    # vars that use_azure_secret() injects from the 'azcreds' secret.
    operation = deploy_operation(deployment_name='deploymentname',
                                 model_name='model_name:1',
                                 tenant_id='$(AZ_TENANT_ID)',
                                 service_principal_id='$(AZ_CLIENT_ID)',
                                 service_principal_password='******',
                                 subscription_id='$(AZ_SUBSCRIPTION_ID)',
                                 resource_group=resource_group,
                                 workspace=workspace,
                                 inference_config='src/inferenceconfig.json',
                                 deployment_config='src/deploymentconfig.json'). \
        apply(use_azure_secret()). \
        apply(use_image(deploy_image_name))
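`use_image` is not part of kfp; it is a helper defined in the source repo. A plausible sketch, assuming it does nothing more than override the task's container image so one pipeline definition can target freshly built images:

def use_image(image_name):
    # Hypothetical reconstruction of the repo-local helper.
    def _use_image(task):
        task.container.image = image_name
        return task
    return _use_image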
Example #3
    def test_use_azure_secret(self):
        with Pipeline('somename') as p:
            op1 = ContainerOp(name='op1', image='image')
            op1 = op1.apply(use_azure_secret('azcreds'))
            assert len(op1.env_variables) == 4

            # Expected names: AZ_SUBSCRIPTION_ID, AZ_TENANT_ID, AZ_CLIENT_ID
            # and AZ_CLIENT_SECRET, each sourced from the 'azcreds' secret.
            for env_var in op1.env_variables:
                print(env_var.name)
                print(env_var.value_from.secret_key_ref.name)
                print(env_var.value_from.secret_key_ref.key)
Example #4
def transformer(containerOp):
    # Prepend the wrapper invocation to whatever arguments the op already has.
    containerOp.arguments = ['/scripts/pipelineWrapper.py', 'Privacy',
                             'python'] + containerOp.arguments
    # shouldn't hard code this experiment name

    containerOp.container.set_image_pull_policy("Always")
    containerOp.add_volume(
        k8s_client.V1Volume(
            name='azure',
            persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(  # noqa: E501
                claim_name='azure-managed-disk')
        )
    ).add_volume_mount(k8s_client.V1VolumeMount(mount_path='/mnt/azure',
                                                name='azure'))

    # `ws` is an Azure ML Workspace object defined in the enclosing module.
    containerOp.container.add_env_variable(V1EnvVar(name='AZ_NAME', value=ws.name)) \
        .add_env_variable(V1EnvVar(name='AZ_SUBSCRIPTION_ID', value=ws.subscription_id)) \
        .add_env_variable(V1EnvVar(name='AZ_RESOURCE_GROUP', value=ws.resource_group))
    containerOp.apply(use_azure_secret('azcreds'))

    return containerOp
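On its own, a transformer does nothing; in kfp v1 it is registered on the pipeline configuration so it runs over every op. A minimal usage sketch (pipeline name and body are placeholders):

import kfp.dsl as dsl


@dsl.pipeline(name='privacy-pipeline',
              description='Applies the transformer above to every op')
def privacy_pipeline():
    # ... define the pipeline's ContainerOps here ...
    dsl.get_pipeline_conf().add_op_transformer(transformer)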
Example #5
def run_spark_job(main_definition_file, command_line_arguments):
    # run_job_operation, use_image and run_job_image_name are defined
    # elsewhere in the source module; the service principal password is
    # redacted in this listing.
    operation = run_job_operation(executor_size='Small',
                                  executors=1,
                                  main_class_name='""',
                                  main_definition_file=main_definition_file,
                                  name='kubeflowsynapsetest',
                                  tenant_id='$(AZ_TENANT_ID)',
                                  service_principal_id='$(AZ_CLIENT_ID)',
                                  service_principal_password='******',
                                  subscription_id='$(AZ_SUBSCRIPTION_ID)',
                                  resource_group='kubeflow-demo-rg',
                                  command_line_arguments=command_line_arguments,
                                  spark_pool_name='kubeflowsynapse',
                                  language='',
                                  reference_files='',
                                  configuration='',
                                  tags='',
                                  spark_pool_config_file='./src/spark_pool_config.yaml',  # noqa: E501
                                  wait_until_job_finished=True,
                                  waiting_timeout_in_seconds=3600,
                                  workspace_name='kubeflow-demo'). \
        apply(use_azure_secret()). \
        apply(use_image(run_job_image_name))
Example #6
def cnn_train(resource_group, workspace, dataset, token):
    """Pipeline steps"""

    persistent_volume_path = '/mnt/azure'
    data_download = dataset  # noqa: E501
    batch = 32
    model_name = 'cnnmodel'
    operations = {}
    image_size = 160
    training_folder = 'train'
    training_dataset = 'train.txt'
    model_folder = 'model'
    image_repo_name = "k8scc01covidmlopsacr.azurecr.io/mlops"
    callback_url = 'kubemlopsbot-svc.kubeflow.svc.cluster.local:8080'
    mlflow_url = 'http://mlflow.mlflow:5000'

    exit_op = dsl.ContainerOp(name='Exit Handler',
                              image="curlimages/curl",
                              command=['curl'],
                              arguments=[
                                  '-d',
                                  get_callback_payload(TRAIN_FINISH_EVENT),
                                  callback_url
                              ])

    with dsl.ExitHandler(exit_op):
        start_callback = \
            dsl.UserContainer('callback',
                              'curlimages/curl',
                              command=['curl'],
                              args=['-d',
                                    get_callback_payload(TRAIN_START_EVENT), callback_url])  # noqa: E501

        operations['tensorflow preprocess'] = dsl.ContainerOp(
            name='tensorflow preprocess',
            init_containers=[start_callback],
            image=image_repo_name + '/tensorflow-preprocess:latest',
            command=['python'],
            arguments=[
                '/scripts/data.py', '--base_path', persistent_volume_path,
                '--data', training_folder, '--target', training_dataset,
                '--img_size', image_size, '--zipfile', data_download
            ])

        operations['tensorflow training'] = dsl.ContainerOp(
            name="tensorflow training",
            image=image_repo_name + '/tensorflow-training:latest',
            command=['python'],
            arguments=[
                '/scripts/train.py', '--base_path', persistent_volume_path,
                '--data', training_folder, '--epochs', 2, '--batch', batch,
                '--image_size', image_size, '--lr', 0.0001, '--outputs',
                model_folder, '--dataset', training_dataset
            ],
            output_artifact_paths={
                'mlpipeline-metrics': '/mlpipeline-metrics.json',
                'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'
            }). \
            apply(use_azstorage_secret()). \
            add_env_variable(V1EnvVar(name="RUN_ID",
                                      value=dsl.RUN_ID_PLACEHOLDER)). \
            add_env_variable(V1EnvVar(name="MLFLOW_TRACKING_TOKEN",
                                      value=token)). \
            add_env_variable(V1EnvVar(name="MLFLOW_TRACKING_URI",
                                      value=mlflow_url)). \
            add_env_variable(V1EnvVar(name="GIT_PYTHON_REFRESH",
                                      value='quiet'))

        operations['tensorflow training'].after(
            operations['tensorflow preprocess'])  # noqa: E501

        operations['evaluate'] = dsl.ContainerOp(
            name='evaluate',
            image="busybox",
            command=['sh', '-c'],
            # 'sh -c' takes the whole script as one argument; as two separate
            # arguments the message would land in $0 and echo an empty line.
            arguments=['echo "Life is Good!"'])
        operations['evaluate'].after(operations['tensorflow training'])

        operations['register kubeflow'] = dsl.ContainerOp(
            name='register kubeflow',
            image=image_repo_name + '/register-kubeflow-artifacts:latest',
            command=['python'],
            arguments=[
                '/scripts/register.py', '--base_path', persistent_volume_path,
                '--model', 'latest.h5', '--model_name', model_name, '--data',
                training_folder, '--dataset', training_dataset, '--run_id',
                dsl.RUN_ID_PLACEHOLDER
            ]).apply(use_azure_secret())
        operations['register kubeflow'].after(operations['evaluate'])

        operations['register AML'] = dsl.ContainerOp(
            name='register AML',
            image=image_repo_name + '/register-aml:latest',
            command=['python'],
            arguments=[
                '/scripts/register.py', '--base_path', persistent_volume_path,
                '--model', 'latest.h5', '--model_name', model_name,
                '--tenant_id', "$(AZ_TENANT_ID)", '--service_principal_id',
                "$(AZ_CLIENT_ID)", '--service_principal_password',
                "$(AZ_CLIENT_SECRET)", '--subscription_id',
                "$(AZ_SUBSCRIPTION_ID)", '--resource_group', resource_group,
                '--workspace', workspace, '--run_id', dsl.RUN_ID_PLACEHOLDER
            ]).apply(use_azure_secret())
        operations['register AML'].after(operations['register kubeflow'])

        operations['register mlflow'] = dsl.ContainerOp(
            name='register mlflow',
            image=image_repo_name + '/register-mlflow:latest',
            command=['python'],
            arguments=[
                '/scripts/register.py', '--model', 'model', '--model_name',
                model_name, '--experiment_name', 'kubeflow-mlops', '--run_id',
                dsl.RUN_ID_PLACEHOLDER
            ]). \
            apply(use_azure_secret()). \
            add_env_variable(V1EnvVar(name="MLFLOW_TRACKING_URI",
                                      value=mlflow_url)). \
            add_env_variable(V1EnvVar(name="MLFLOW_TRACKING_TOKEN",
                                      value=token))
        operations['register mlflow'].after(operations['register AML'])

        operations['finalize'] = dsl.ContainerOp(
            name='Finalize',
            image="curlimages/curl",
            command=['curl'],
            arguments=[
                '-d',
                get_callback_payload("Model is registered"), callback_url
            ])
        operations['finalize'].after(operations['register mlflow'])

    for _, op_1 in operations.items():
        op_1.container.set_image_pull_policy("Always")
        op_1.add_volume(
            k8s_client.V1Volume(
                name='azure',
                persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(  # noqa: E501
                    claim_name='azure-managed-file'))
        ).add_volume_mount(k8s_client.V1VolumeMount(
            mount_path='/mnt/azure', name='azure'))
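`get_callback_payload`, `TRAIN_START_EVENT` and `TRAIN_FINISH_EVENT` come from the source repo, not kfp. A hypothetical sketch of the payload builder, assuming it serializes the event type plus the KFP run ID for the `curl -d` callback calls (the real event values and payload shape are not shown in these listings):

import json

import kfp.dsl as dsl

# Hypothetical event constants; the actual values live in the source repo.
TRAIN_START_EVENT = 'Training started'
TRAIN_FINISH_EVENT = 'Training finished'


def get_callback_payload(event_type):
    # Body POSTed by the curl-based callback/exit-handler steps.
    return json.dumps({
        'event_type': event_type,
        'run_id': dsl.RUN_ID_PLACEHOLDER,  # resolved by KFP at run time
    })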
Example #7
def tacosandburritos_train(
    resource_group,
    workspace
):
    """Pipeline steps"""

    persistent_volume_path = '/mnt/azure'
    data_download = 'https://aiadvocate.blob.core.windows.net/public/tacodata.zip'  # noqa: E501
    epochs = 2
    batch = 32
    learning_rate = 0.0001
    model_name = 'tacosandburritos'
    operations = {}
    image_size = 160
    training_folder = 'train'
    training_dataset = 'train.txt'
    model_folder = 'model'
    image_repo_name = "kubeflowyoacr.azurecr.io/mexicanfood"
    callback_url = 'kubemlopsbot-svc.kubeflow.svc.cluster.local:8080'

    exit_op = dsl.ContainerOp(
        name='Exit Handler',
        image="curlimages/curl",
        command=['curl'],
        arguments=[
            '-d', get_callback_payload(TRAIN_FINISH_EVENT),
            callback_url
        ]
    )

    with dsl.ExitHandler(exit_op):
        start_callback = \
            dsl.UserContainer('callback',
                              'curlimages/curl',
                              command=['curl'],
                              args=['-d',
                                    get_callback_payload(TRAIN_START_EVENT), callback_url])  # noqa: E501
        operations['preprocess'] = dsl.ContainerOp(
            name='preprocess',
            init_containers=[start_callback],
            image=image_repo_name + '/preprocess:latest',
            command=['python'],
            arguments=[
                '/scripts/data.py',
                '--base_path', persistent_volume_path,
                '--data', training_folder,
                '--target', training_dataset,
                '--img_size', image_size,
                '--zipfile', data_download
            ]
        )

        # train
        operations['training'] = dsl.ContainerOp(
            name='training',
            image=image_repo_name + '/training:latest',
            command=['python'],
            arguments=[
                '/scripts/train.py',
                '--base_path', persistent_volume_path,
                '--data', training_folder,
                '--epochs', epochs,
                '--batch', batch,
                '--image_size', image_size,
                '--lr', learning_rate,
                '--outputs', model_folder,
                '--dataset', training_dataset
            ]
        )
        operations['training'].after(operations['preprocess'])

        # register model
        operations['register'] = dsl.ContainerOp(
            name='register',
            image=image_repo_name + '/register:latest',
            command=['python'],
            arguments=[
                '/scripts/register.py',
                '--base_path', persistent_volume_path,
                '--model', 'latest.h5',
                '--model_name', model_name,
                '--tenant_id', "$(AZ_TENANT_ID)",
                '--service_principal_id', "$(AZ_CLIENT_ID)",
                '--service_principal_password', "$(AZ_CLIENT_SECRET)",
                '--subscription_id', "$(AZ_SUBSCRIPTION_ID)",
                '--resource_group', resource_group,
                '--workspace', workspace,
                '--run_id', dsl.RUN_ID_PLACEHOLDER
            ]
        ).apply(use_azure_secret())
        operations['register'].after(operations['training'])

        operations['finalize'] = dsl.ContainerOp(
            name='Finalize',
            image="curlimages/curl",
            command=['curl'],
            arguments=[
                '-d', get_callback_payload("Model is registered"),
                callback_url
            ]
        )
        operations['finalize'].after(operations['register'])

    # operations['deploy'] = dsl.ContainerOp(
    #     name='deploy',
    #     image=image_repo_name + '/deploy:latest',
    #     command=['sh'],
    #     arguments=[
    #         '/scripts/deploy.sh',
    #         '-n', model_name,
    #         '-m', model_name,
    #         '-t', "$(AZ_TENANT_ID)",
    #         '-r', resource_group,
    #         '-w', workspace,
    #         '-s', "$(AZ_CLIENT_ID)",
    #         '-p', "$(AZ_CLIENT_SECRET)",
    #         '-u', "$(AZ_SUBSCRIPTION_ID)",
    #         '-b', persistent_volume_path,
    #         '-x', dsl.RUN_ID_PLACEHOLDER
    #     ]
    # ).apply(use_azure_secret())
    # operations['deploy'].after(operations['register'])

    for _, op_1 in operations.items():
        op_1.container.set_image_pull_policy("Always")
        op_1.add_volume(
            k8s_client.V1Volume(
                name='azure',
                persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(  # noqa: E501
                    claim_name='azure-managed-disk')
            )
        ).add_volume_mount(k8s_client.V1VolumeMount(
            mount_path='/mnt/azure', name='azure'))
Example #8
def tacosandburritos_train(
    resource_group,
    workspace
):
    """Pipeline steps"""

    persistent_volume_path = '/mnt/azure'
    data_download = 'https://aiadvocate.blob.core.windows.net/public/tacodata.zip'  # noqa: E501
    epochs = 2
    batch = 32
    learning_rate = 0.0001
    model_name = 'tacosandburritos'
    operations = {}
    image_size = 160
    training_folder = 'train'
    training_dataset = 'train.txt'
    model_folder = 'model'
    image_repo_name = "kubeflowyoacr.azurecr.io/mexicanfood"

    # preprocess data

    operations['preprocess'] = dsl.ContainerOp(
        name='preprocess',
        image=image_repo_name + '/preprocess:latest',
        command=['python'],
        arguments=[
            '/scripts/data.py',
            '--base_path', persistent_volume_path,
            '--data', training_folder,
            '--target', training_dataset,
            '--img_size', image_size,
            '--zipfile', data_download
        ]
    )

    # train
    operations['training'] = dsl.ContainerOp(
        name='training',
        image=image_repo_name + '/training:latest',
        command=['python'],
        arguments=[
            '/scripts/train.py',
            '--base_path', persistent_volume_path,
            '--data', training_folder,
            '--epochs', epochs,
            '--batch', batch,
            '--image_size', image_size,
            '--lr', learning_rate,
            '--outputs', model_folder,
            '--dataset', training_dataset
        ]
    )
    operations['training'].after(operations['preprocess'])

    # register model
    operations['register'] = dsl.ContainerOp(
        name='register',
        image=image_repo_name + '/register:latest',
        command=['python'],
        arguments=[
            '/scripts/register.py',
            '--base_path', persistent_volume_path,
            '--model', 'latest.h5',
            '--model_name', model_name,
            '--tenant_id', "$(AZ_TENANT_ID)",
            '--service_principal_id', "$(AZ_CLIENT_ID)",
            '--service_principal_password', "$(AZ_CLIENT_SECRET)",
            '--subscription_id', "$(AZ_SUBSCRIPTION_ID)",
            '--resource_group', resource_group,
            '--workspace', workspace,
            '--run_id', dsl.RUN_ID_PLACEHOLDER
        ]
    ).apply(use_azure_secret())

    operations['register'].after(operations['training'])

    operations['deploy'] = dsl.ContainerOp(
        name='deploy',
        image=image_repo_name + '/deploy:latest',
        command=['sh'],
        arguments=[
            '/scripts/deploy.sh',
            '-n', model_name,
            '-m', model_name,
            '-i', '/scripts/inferenceconfig.json',
            '-d', '/scripts/deploymentconfig.json',
            '-t', "$(AZ_TENANT_ID)",
            '-r', resource_group,
            '-w', workspace,
            '-s', "$(AZ_CLIENT_ID)",
            '-p', "$(AZ_CLIENT_SECRET)",
            '-u', "$(AZ_SUBSCRIPTION_ID)",
            '-b', persistent_volume_path,
            '-x', dsl.RUN_ID_PLACEHOLDER
        ]
    ).apply(use_azure_secret())
    operations['deploy'].after(operations['register'])

    for _, op_1 in operations.items():
        op_1.container.set_image_pull_policy("Always")
        op_1.add_volume(
            k8s_client.V1Volume(
                name='azure',
                persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(  # noqa: E501
                    claim_name='azure-managed-disk')
            )
        ).add_volume_mount(k8s_client.V1VolumeMount(
            mount_path='/mnt/azure', name='azure'))
Example #9
def tacosandburritos_train(resource_group, workspace, dataset):
    # operations, callback_url, persistent_volume_path and the *_op / use_*
    # helpers are defined at module level in the source; Example #10 shows
    # the full set of such definitions.

    exit_handler = exit_op(
        callback_url=callback_url,
        callback_payload=get_callback_payload(TRAIN_FINISH_EVENT))

    with dsl.ExitHandler(exit_handler):

        operations['data processing on databricks'] = databricks_op(
            run_id=dsl.RUN_ID_PLACEHOLDER,
            notebook_params='{"argument_one":"param one",'
                            '"argument_two":"param two"}'). \
            apply(use_databricks_secret()). \
            add_init_container(get_start_callback_container()). \
            apply(use_image(databricks_image_name))

        operations['preprocess'] = preprocess_op(
            base_path=persistent_volume_path,
            training_folder=training_folder,
            target=training_dataset,
            image_size=image_size,
            zipfile=dataset). \
            apply(use_image(preprocess_image_name))

        operations['preprocess'].after(
            operations['data processing on databricks'])  # noqa: E501

        operations['training'] = train_op(
            base_path=persistent_volume_path,
            training_folder=training_folder,
            epochs=2,
            batch=batch,
            image_size=image_size,
            lr=0.0001,
            model_folder=model_folder,
            images=training_dataset,
            dataset=operations['preprocess'].outputs['dataset']). \
            set_memory_request('16G'). \
            add_env_variable(V1EnvVar(name="RUN_ID",
                                      value=dsl.RUN_ID_PLACEHOLDER)). \
            add_env_variable(V1EnvVar(name="MLFLOW_TRACKING_URI",
                                      value=mlflow_url)). \
            add_env_variable(V1EnvVar(name="GIT_PYTHON_REFRESH",
                                      value='quiet')). \
            apply(use_image(train_image_name))

        operations['training'].after(operations['preprocess'])

        operations['evaluate'] = evaluate_op(
            model=operations['training'].outputs['model'])
        operations['evaluate'].after(operations['training'])

        operations['register to AML'] = register_op(
            base_path=persistent_volume_path,
            model_file='latest.h5',
            model_name=model_name,
            tenant_id='$(AZ_TENANT_ID)',
            service_principal_id='$(AZ_CLIENT_ID)',
            service_principal_password='******',
            subscription_id='$(AZ_SUBSCRIPTION_ID)',
            resource_group=resource_group,
            workspace=workspace,
            run_id=dsl.RUN_ID_PLACEHOLDER). \
            apply(use_azure_secret()). \
            apply(use_image(register_images_name))

        operations['register to AML'].after(operations['evaluate'])

        operations['register to mlflow'] = register_mlflow_op(
            model='model',
            model_name=model_name,
            experiment_name='mexicanfood',
            run_id=dsl.RUN_ID_PLACEHOLDER). \
            apply(use_azure_secret()). \
            add_env_variable(V1EnvVar(name="MLFLOW_TRACKING_URI",
                                      value=mlflow_url)). \
            apply(use_image(register_mlflow_image_name))

        operations['register to mlflow'].after(operations['register to AML'])

        operations['finalize'] = finalize_op(
            callback_url=callback_url,
            callback_payload=get_callback_payload("Model is registered"))
        operations['finalize'].after(operations['register to mlflow'])

    for _, op_1 in operations.items():
        op_1.container.set_image_pull_policy("Always")
        op_1.add_volume(
            k8s_client.V1Volume(
                name='azure',
                persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(  # noqa: E501
                    claim_name='azure-managed-file'))
        ).add_volume_mount(k8s_client.V1VolumeMount(
            mount_path='/mnt/azure', name='azure'))
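`use_databricks_secret` is likewise a repo-local modifier rather than part of `kfp.azure`. A plausible sketch, assuming a Kubernetes secret that holds the Databricks workspace host and access token, mirroring how `use_azure_secret` maps the AZ_* keys:

from kubernetes.client.models import (V1EnvVar, V1EnvVarSource,
                                      V1SecretKeySelector)


def use_databricks_secret(secret_name='databricks'):
    # Assumed secret keys; the real names are defined in the source repo.
    def _use_databricks_secret(task):
        for key in ['DATABRICKS_HOST', 'DATABRICKS_TOKEN']:
            task.container.add_env_variable(
                V1EnvVar(name=key,
                         value_from=V1EnvVarSource(
                             secret_key_ref=V1SecretKeySelector(
                                 name=secret_name, key=key))))
        return task
    return _use_databricks_secret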
Example #10
def tacosandburritos_train(resource_group, workspace, dataset):
    """Pipeline steps"""

    persistent_volume_path = '/mnt/azure'
    data_download = dataset  # noqa: E501
    batch = 32
    model_name = 'tacosandburritos'
    operations = {}
    image_size = 160
    training_folder = 'train'
    training_dataset = 'train.txt'
    model_folder = 'model'
    image_repo_name = "kubeflowyoacr.azurecr.io/mexicanfood"
    callback_url = 'kubemlopsbot-svc.kubeflow.svc.cluster.local:8080'
    mlflow_url = 'http://mlflow:5000'

    exit_op = dsl.ContainerOp(name='Exit Handler',
                              image="curlimages/curl",
                              command=['curl'],
                              arguments=[
                                  '-d',
                                  get_callback_payload(TRAIN_FINISH_EVENT),
                                  callback_url
                              ])

    with dsl.ExitHandler(exit_op):
        start_callback = \
            dsl.UserContainer('callback',
                              'curlimages/curl',
                              command=['curl'],
                              args=['-d',
                                    get_callback_payload(TRAIN_START_EVENT), callback_url])  # noqa: E501

        operations['data processing on databricks'] = dsl.ContainerOp(
            name='data processing on databricks',
            init_containers=[start_callback],
            image=image_repo_name + '/databricks-notebook:latest',
            arguments=[
                '-r', dsl.RUN_ID_PLACEHOLDER, '-p',
                '{"argument_one":"param one","argument_two":"param two"}'
            ]).apply(use_databricks_secret())

        operations['preprocess'] = dsl.ContainerOp(
            name='preprocess',
            image=image_repo_name + '/preprocess:latest',
            command=['python'],
            arguments=[
                '/scripts/data.py', '--base_path', persistent_volume_path,
                '--data', training_folder, '--target', training_dataset,
                '--img_size', image_size, '--zipfile', data_download
            ])

        operations['preprocess'].after(
            operations['data processing on databricks'])  # noqa: E501

        #  train
        #  TODO: read set of parameters from config file
        with dsl.ParallelFor([
                {'epochs': 1, 'lr': 0.0001},
                {'epochs': 2, 'lr': 0.0002},
                {'epochs': 3, 'lr': 0.0003},
        ]) as item:
            operations['training'] = dsl.ContainerOp(
                name="training",
                image=image_repo_name + '/training:latest',
                command=['python'],
                arguments=[
                    '/scripts/train.py', '--base_path', persistent_volume_path,
                    '--data', training_folder, '--epochs', item.epochs,
                    '--batch', batch, '--image_size', image_size, '--lr',
                    item.lr, '--outputs', model_folder, '--dataset',
                    training_dataset
                ],
                output_artifact_paths={
                    # change output_artifact_paths to file_outputs after this
                    # PR is merged:
                    # https://github.com/kubeflow/pipelines/pull/2334
                    'mlpipeline-metrics': '/mlpipeline-metrics.json',
                    'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'
                }). \
                add_env_variable(V1EnvVar(name="RUN_ID",
                                          value=dsl.RUN_ID_PLACEHOLDER)). \
                add_env_variable(V1EnvVar(name="MLFLOW_TRACKING_URI",
                                          value=mlflow_url)). \
                add_env_variable(V1EnvVar(name="GIT_PYTHON_REFRESH",
                                          value='quiet'))

        operations['training'].after(operations['preprocess'])

        operations['evaluate'] = dsl.ContainerOp(
            name='evaluate',
            image="busybox",
            command=['sh', '-c'],
            # 'sh -c' takes the whole script as one argument; as two separate
            # arguments the message would land in $0 and echo an empty line.
            arguments=['echo "Life is Good!"'])
        operations['evaluate'].after(operations['training'])

        # register kubeflow artifacts model
        operations['register to kubeflow'] = dsl.ContainerOp(
            name='register to kubeflow',
            image=image_repo_name + '/registerartifacts:latest',
            command=['python'],
            arguments=[
                '/scripts/registerartifacts.py', '--base_path',
                persistent_volume_path, '--model', 'latest.h5', '--model_name',
                model_name, '--data', training_folder, '--dataset',
                training_dataset, '--run_id', dsl.RUN_ID_PLACEHOLDER
            ]).apply(use_azure_secret())
        operations['register to kubeflow'].after(operations['evaluate'])

        # register model
        operations['register to AML'] = dsl.ContainerOp(
            name='register to AML',
            image=image_repo_name + '/register:latest',
            command=['python'],
            arguments=[
                '/scripts/register.py', '--base_path', persistent_volume_path,
                '--model', 'latest.h5', '--model_name', model_name,
                '--tenant_id', "$(AZ_TENANT_ID)", '--service_principal_id',
                "$(AZ_CLIENT_ID)", '--service_principal_password',
                "$(AZ_CLIENT_SECRET)", '--subscription_id',
                "$(AZ_SUBSCRIPTION_ID)", '--resource_group', resource_group,
                '--workspace', workspace, '--run_id', dsl.RUN_ID_PLACEHOLDER
            ]).apply(use_azure_secret())
        operations['register to AML'].after(operations['register to kubeflow'])

        # register model to mlflow
        operations['register to mlflow'] = dsl.ContainerOp(
            name='register to mlflow',
            image=image_repo_name + '/register-mlflow:latest',
            command=['python'],
            arguments=[
                '/scripts/register.py', '--model', 'model', '--model_name',
                model_name, '--experiment_name', 'mexicanfood', '--run_id',
                dsl.RUN_ID_PLACEHOLDER
            ]). \
            apply(use_azure_secret()). \
            add_env_variable(V1EnvVar(name="MLFLOW_TRACKING_URI",
                                      value=mlflow_url))
        operations['register to mlflow'].after(operations['register to AML'])

        operations['finalize'] = dsl.ContainerOp(
            name='Finalize',
            image="curlimages/curl",
            command=['curl'],
            arguments=[
                '-d',
                get_callback_payload("Model is registered"), callback_url
            ])
        operations['finalize'].after(operations['register to mlflow'])

    # operations['deploy'] = dsl.ContainerOp(
    #     name='deploy',
    #     image=image_repo_name + '/deploy:latest',
    #     command=['sh'],
    #     arguments=[
    #         '/scripts/deploy.sh',
    #         '-n', model_name,
    #         '-m', model_name,
    #         '-t', "$(AZ_TENANT_ID)",
    #         '-r', resource_group,
    #         '-w', workspace,
    #         '-s', "$(AZ_CLIENT_ID)",
    #         '-p', "$(AZ_CLIENT_SECRET)",
    #         '-u', "$(AZ_SUBSCRIPTION_ID)",
    #         '-b', persistent_volume_path,
    #         '-x', dsl.RUN_ID_PLACEHOLDER
    #     ]
    # ).apply(use_azure_secret())
    # operations['deploy'].after(operations['register'])

    for _, op_1 in operations.items():
        op_1.container.set_image_pull_policy("Always")
        op_1.add_volume(
            k8s_client.V1Volume(
                name='azure',
                persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(  # noqa: E501
                    claim_name='azure-managed-file'))
        ).add_volume_mount(k8s_client.V1VolumeMount(
            mount_path='/mnt/azure', name='azure'))
Example #11
def tacosandburritos_train(resource_group,
                           workspace,
                           dataset,
                           mlflow_experiment_id,
                           azdocallbackinfo=None):

    # operations, persistent_volume_path, mlflow_url and the *_op / use_*
    # helpers are defined at module level in the source; the service
    # principal password is redacted in this listing.
    exit_handler_op = exit_op(
        kfp_host_url="$(KFP_HOST)",
        azdocallbackinfo=azdocallbackinfo,
        run_id=dsl.RUN_ID_PLACEHOLDER,
        tenant_id="$(AZ_TENANT_ID)",
        service_principal_id="$(AZ_CLIENT_ID)",
        service_principal_password="******",
        pat_env="PAT_ENV"). \
        apply(use_azure_secret()). \
        apply(use_kfp_host_secret()). \
        apply(use_image(exit_image_name)). \
        apply(use_secret_var("azdopat", "PAT_ENV", "azdopat"))

    with dsl.ExitHandler(exit_op=exit_handler_op):

        operations['mlflowproject'] = mlflow_project_op(
            mlflow_experiment_id=mlflow_experiment_id,
            kf_run_id=dsl.RUN_ID_PLACEHOLDER). \
            apply(use_databricks_secret()). \
            apply(use_image(mlflow_project_image_name))

        operations['preprocess'] = preprocess_op(
            base_path=persistent_volume_path,
            training_folder=training_folder,
            target=training_dataset,
            image_size=image_size,
            zipfile=dataset). \
            apply(use_image(preprocess_image_name))

        operations['preprocess'].after(operations['mlflowproject'])

        operations['training'] = train_op(base_path=persistent_volume_path,
                                          training_folder=training_folder,
                                          epochs=2,
                                          batch=batch,
                                          image_size=image_size,
                                          lr=0.0001,
                                          model_folder=model_folder,
                                          images=training_dataset,
                                          dataset=operations['preprocess'].outputs['dataset']). \
            set_memory_request('16G'). \
            add_env_variable(V1EnvVar(name="RUN_ID", value=dsl.RUN_ID_PLACEHOLDER)). \
            add_env_variable(V1EnvVar(name="MLFLOW_TRACKING_URI", value=mlflow_url)). \
            add_env_variable(V1EnvVar(name="GIT_PYTHON_REFRESH", value='quiet')). \
            apply(use_image(train_image_name))

        # Spot nodepool target
        # operations['training'].add_toleration(k8s_client.V1Toleration(
        #     key='kubernetes.azure.com/scalesetpriority',
        #     operator='Equal',
        #     value='spot',
        #     effect="NoSchedule"))

        # Virtual/ACI nodepool target
        # operations['training'].add_node_selector_constraint(
        #     label_name='type', value='virtual-kubelet')
        # operations['training'].add_toleration(k8s_client.V1Toleration(
        #     key='virtual-kubelet.io/provider', operator='Exists'))

        operations['training'].after(operations['preprocess'])

        operations['evaluate'] = evaluate_op(
            model=operations['training'].outputs['model'])
        operations['evaluate'].after(operations['training'])

        operations['register to AML'] = register_op(
            base_path=persistent_volume_path,
            model_file='latest.h5',
            model_name=model_name,
            tenant_id='$(AZ_TENANT_ID)',
            service_principal_id='$(AZ_CLIENT_ID)',
            service_principal_password='******',
            subscription_id='$(AZ_SUBSCRIPTION_ID)',
            resource_group=resource_group,
            workspace=workspace,
            run_id=dsl.RUN_ID_PLACEHOLDER). \
            apply(use_azure_secret()). \
            apply(use_image(register_images_name))

        operations['register to AML'].after(operations['evaluate'])

        operations['register to mlflow'] = register_mlflow_op(
            model='model',
            model_name=model_name,
            experiment_name='mexicanfood',
            run_id=dsl.RUN_ID_PLACEHOLDER). \
            apply(use_azure_secret()). \
            add_env_variable(V1EnvVar(name="MLFLOW_TRACKING_URI",
                                      value=mlflow_url)). \
            apply(use_image(register_mlflow_image_name))

        operations['register to mlflow'].after(operations['register to AML'])

    for _, op_1 in operations.items():
        op_1.container.set_image_pull_policy("Always")
        op_1.add_volume(
            k8s_client.V1Volume(
                name='azure',
                persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(  # noqa: E501
                    claim_name='azure-managed-file'))
        ).add_volume_mount(k8s_client.V1VolumeMount(
            mount_path='/mnt/azure', name='azure'))