sourceDirectory = "./code"
remoteComputeTargetName = "default"  # make this a parameter
computeTarget = ws.compute_targets[remoteComputeTargetName]

run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
    conda_packages=['numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'],
    pip_packages=[
        'azure', 'azureml-core', 'azure-storage', 'azure-storage-blob',
        'azureml-dataprep'
    ]))
run_config.environment.docker.enabled = True

##------------- Pipeline Parameters

datasetName = PipelineParameter(name="datasetName",
                                default_value="qualitydataset")
datasetStorePath = PipelineParameter(name="datasetStorePath",
                                     default_value="/inputdata/train.csv")
modelName = PipelineParameter(name="modelName",
                              default_value="quality_gbm_model.pkl")

##------------- Data preprocessing step
preprocessingStep = PythonScriptStep(name="preprocess_step",
                                     script_name="preprocess.py",
                                     source_directory=sourceDirectory,
                                     compute_target=computeTarget,
                                     arguments=[
                                         "--datasetName",
                                         datasetName,
                                         "--datasetStorePath",
                                         datasetStorePath,
    #copy all necessary scripts
    files = FilesProviders.get_path_files(
        "../", [os.path.basename(__file__), "__init__.py"])

    for f in files:
        shutil.copy(f, script_folder)
    #add generated config file to script folder
    shutil.copy(generated_config_file, script_folder)

    input_data = input_dataset.as_named_input('input_dataset').as_mount()

    data_store = ws.get_default_datastore()
    prepped_data = PipelineData('prepped_data', datastore=data_store)

    pipeline_mode_param = PipelineParameter(name="mode",
                                            default_value="execute")

    prep_step = PythonScriptStep(
        name='Prepare data',
        source_directory=script_folder,
        script_name='prep_data.py',
        compute_target=compute_target,
        runconfig=pipeline_run_config,
        # Specify dataset as initial input
        inputs=[input_data],
        # Specify PipelineData as output
        outputs=[prepped_data],
        # Also pass as data reference to script
        arguments=[
            '--input_data', input_data, '--prepped_data', prepped_data,
            '--mode', pipeline_mode_param
Beispiel #3
0
def main():
    e = ENV()
    aml_workspace = Workspace.get(
                                  name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group
                                  )
    print(f'workspace:{aml_workspace}')
    
    aml_compute = get_compute(workspace=aml_workspace,
                              compute_name=e.compute_name,
                              vm_size=e.vm_size)
    if aml_compute is not None:
        print(f'compute target: {aml_compute} is created')
    
    environment = get_environment(workspace=aml_workspace,
                                  env_name=e.aml_env_name,
                                  conda_dependencies=e.aml_conda_train_dependent_files,
                                  create_new=e.rebuild_env)
    run_config = RunConfiguration()
    run_config.environment = environment
    
    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables['DATASTORE_NAME'] = datastore_name
    
    model_name_param = PipelineParameter(name='model_name',default_value=e.model_name)
    dataset_version_param = PipelineParameter(name='dataset_version',default_value=e.dataset_version)
    dataset_file_path = PipelineParameter(name='dataset_file_path',default_value='none')
    
    dataset_name = e.dataset_name
    if dataset_name not in aml_workspace.datasets:
        create_sample_data_csv()
        file_name = 'diabetes.csv'
        if not os.path.exists(file_name):
            raise Exception(f'{file_name} does not exist!')
        datastore = Datastore.get(aml_workspace,datastore_name)
        target_path = 'training/'
        datastore.upload_files(files=[file_name],
                               target_path=target_path,
                               overwrite=True,
                               show_progress=True)
        path_on_datastore = os.path.join(target_path,file_name)
        dataset = Dataset.Tabular.from_delimited_files(path=(datastore,path_on_datastore))
        dataset.register(workspace=aml_workspace,
                         name=dataset_name,
                         description='registered dataset',
                         create_new_version=True,
                         tags={'format':'CSV'})
    
    pipeline_data = PipelineData('train',datastore=aml_workspace.get_default_datastore())
    train_step = PythonScriptStep(script_name=e.train_script_path,
                                  name='train_step',
                                  arguments=['--model-name',model_name_param,
                                             '--dataset-name',dataset_name,
                                             '--dataset-version',dataset_version_param,
                                             '--dataset-file-path',dataset_file_path,
                                             '--step-output',pipeline_data],
                                  compute_target=aml_compute,
                                  runconfig=run_config,
                                  source_directory=e.source_train_directory,
                                  outputs=[pipeline_data],
                                  allow_reuse=False)
    print('Train step created!')    
    
    eval_step = PythonScriptStep(name='eval_step',
                                 script_name=e.eval_script_path,
                                 compute_target=aml_compute,
                                 source_directory=e.source_train_directory,
                                 arguments=['--model-name',model_name_param,
                                            '--allow-run-cancel',e.allow_run_cancel],
                                 runconfig=run_config,
                                 allow_reuse=False)
    print('EVAL step created!')
    
    register_step = PythonScriptStep(name='register_step',
                                     script_name=e.register_script_path,
                                     compute_target=aml_compute,
                                     source_directory=e.source_train_directory,
                                     inputs=[pipeline_data],
                                     arguments=['--model-name',model_name_param,
                                                '--step-input',pipeline_data],
                                     runconfig=run_config,
                                     allow_reuse=False)
    print('Register step created!')
    
    if e.run_evaluation:
        print('evaluation step is included')
        eval_step.run_after(train_step)
        register_step.run_after(eval_step)
        steps = [train_step,eval_step,register_step]
    else:
        print('evaluation step is excluded')
        register_step.run_after(train_step)
        steps = [train_step,register_step]
        
    train_pipeline = Pipeline(workspace=aml_workspace,
                              steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(name=e.pipeline_name,
                                                description='model training pipeline',
                                                version=e.build_id)
    print(f'{published_pipeline.name} is published1')
def main():
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = "AML-RG-" + os.environ.get("BASE_NAME")
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    aks_name = os.environ.get("AKS_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")
    experiment_name = os.environ.get("EXPERIMENT_NAME")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(workspace_name, resource_group,
                                  subscription_id, tenant_id, app_id,
                                  app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, compute_name, vm_size)
    if aml_compute is not None:
        print(aml_compute)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=['numpy', 'pandas', 'scikit-learn', 'keras'],
        pip_packages=[
            'azure', 'azureml-sdk', 'azure-storage', 'azure-storage-blob',
            'transformers>=2.1.1', 'tensorflow>=2.0.0', 'tensorflow-gpu>=2.0.0'
        ]))
    run_config.environment.docker.enabled = True

    datastore_name = 'tfworld'
    container_name = 'azure-service-classifier'
    account_name = 'johndatasets'
    sas_token = '?sv=2019-02-02&ss=bfqt&srt=sco&sp=rl&se=2021-06-02T03:40:25Z&st=2020-03-09T19:40:25Z&spr=https&sig=bUwK7AJUj2c%2Fr90Qf8O1sojF0w6wRFgL2c9zMVCWNPA%3D'

    try:
        existing_datastore = Datastore.get(aml_workspace, datastore_name)
    except:  # noqa: E722
        existing_datastore = Datastore \
            .register_azure_blob_container(workspace=aml_workspace,
                                           datastore_name=datastore_name,
                                           container_name=container_name,
                                           account_name=account_name,
                                           sas_token=sas_token
                                           )

    azure_dataset = Dataset.File.from_files(path=(existing_datastore, 'data'))
    azure_dataset = azure_dataset.register(
        workspace=aml_workspace,
        name='Azure Services Dataset',
        description='Dataset containing azure related posts on Stackoverflow',
        create_new_version=True)

    azure_dataset.to_path()
    input_data = azure_dataset.as_named_input('input_data1').as_mount(
        '/tmp/data')

    model_name = PipelineParameter(name="model_name", default_value=model_name)
    max_seq_length = PipelineParameter(name="max_seq_length",
                                       default_value=128)
    learning_rate = PipelineParameter(name="learning_rate", default_value=3e-5)
    num_epochs = PipelineParameter(name="num_epochs", default_value=3)
    export_dir = PipelineParameter(name="export_dir",
                                   default_value="./outputs/exports")
    batch_size = PipelineParameter(name="batch_size", default_value=32)
    steps_per_epoch = PipelineParameter(name="steps_per_epoch",
                                        default_value=100)

    # initialize the TensorFlow estimator
    estimator = TensorFlow(source_directory=sources_directory_train,
                           entry_script=train_script_path,
                           compute_target=aml_compute,
                           framework_version='2.0',
                           use_gpu=True,
                           pip_packages=[
                               'transformers==2.0.0',
                               'azureml-dataprep[fuse,pandas]==1.3.0'
                           ])

    train_step = EstimatorStep(
        name="Train Model",
        estimator=estimator,
        estimator_entry_script_arguments=[
            "--data_dir", input_data, "--max_seq_length", max_seq_length,
            "--learning_rate", learning_rate, "--num_epochs", num_epochs,
            "--export_dir", export_dir, "--batch_size", batch_size,
            "--steps_per_epoch", steps_per_epoch
        ],
        compute_target=aml_compute,
        inputs=[input_data],
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--model_name",
            model_name,
            "--build_id",
            build_id,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    # Currently, the Evaluate step will automatically register
    # the model if it performs better. This step is based on a
    # previous version of the repo which utilized JSON files to
    # track evaluation results.

    evaluate_step.run_after(train_step)
    steps = [evaluate_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline",
        version=build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')

    response = published_pipeline.submit(  # noqa: F841
        workspace=aml_workspace,
        experiment_name=experiment_name)

    # Get AKS cluster for deployment
    aks_compute = get_aks(aml_workspace, aks_name)
    if aks_compute is not None:
        print(aks_compute)
def main():
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = os.environ.get("BASE_NAME") + "-AML-RG"
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    # register_script_path = os.environ.get("REGISTER_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")

    print(app_secret)

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(workspace_name, resource_group,
                                  subscription_id, tenant_id, app_id,
                                  app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, compute_name, vm_size)
    if aml_compute is not None:
        print(aml_compute)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=[
            'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
        ],
        pip_packages=[
            'azure', 'azureml-core', 'azure-storage', 'azure-storage-blob'
        ]))
    run_config.environment.docker.enabled = True

    model_name = PipelineParameter(name="model_name", default_value=model_name)
    release_id = PipelineParameter(name="release_id", default_value="0")

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=train_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--release_id",
            release_id,
            "--model_name",
            model_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--release_id",
            release_id,
            "--model_name",
            model_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    # Currently, the Evaluate step will automatically register
    # the model if it performs better. This step is based on a
    # previous version of the repo which utilized JSON files to
    # track evaluation results.

    # register_model_step = PythonScriptStep(
    #     name="Register New Trained Model",
    #     script_name=register_script_path,
    #     compute_target=aml_compute,
    #     source_directory=sources_directory_train,
    #     arguments=[
    #         "--release_id", release_id,
    #         "--model_name", model_name,
    #     ],
    #     runconfig=run_config,
    #     allow_reuse=False,
    # )
    # print("Step register model created")

    evaluate_step.run_after(train_step)
    # register_model_step.run_after(evaluate_step)
    steps = [evaluate_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline",
        version=build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Beispiel #6
0
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print(f"get_workspace:{aml_workspace}")

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print(f"aml_compute:{aml_compute}")

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        create_new=e.rebuild_env,
        enable_docker=True,
        dockerfile='ml_model/preprocess/Dockerfile'
    )  #
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables["DATASTORE_NAME"] = datastore_name  # NOQA: E501

    datastore = Datastore(aml_workspace, name=datastore_name)
    data_file_path_param = PipelineParameter(name="data_file_path", default_value=e.dataset_name)  # NOQA: E501

    # The version of the input/output dataset can't be determined at pipeline publish time, only run time.  # NOQA: E501
    # Options to store output data:
    # Option 1: Use blob API to write output data. Otherwise, no way to dynamically change the output dataset based on PipelineParameter, # NOQA: E501
    #     The following will not work. It generate a path like "PipelineParameter_Name:data_file_path_Default:gear_images"  # NOQA: E501
    #         output_ds = OutputFileDatasetConfig(destination=(datastore, data_file_path_param))  # NOQA: E501
    #     This option means writing a file locally and upload to the datastore. Fewer dataset, more code.  # NOQA: E501
    # Option 2: Use a dynamic path in OutputFileDatasetConfig, and register a new dataset at completion  # NOQA: E501
    #     Output dataset can be mounted, so more dataset to maintain, less code.   # NOQA: E501
    # Using Option 2 below.
    output_dataset = OutputFileDatasetConfig(
        name=e.processed_dataset_name,
        destination=(datastore, "/dataset/{output-name}/{run-id}")
    ).register_on_complete(
        name=e.processed_dataset_name)

    preprocess_step = PythonScriptStep(
        name="Preprocess Data with OS cmd",
        script_name='preprocess/preprocess_os_cmd_aml.py',
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--dataset_name", e.dataset_name,
            "--datastore_name", datastore_name,
            "--data_file_path", data_file_path_param,
            "--output_dataset", output_dataset,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Preprocess OS cmd created")

    steps = [preprocess_step]
    preprocess_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    preprocess_pipeline._set_experiment_name
    preprocess_pipeline.validate()
    published_pipeline = preprocess_pipeline.publish(
        name=e.preprocessing_pipeline_name,
        description="Data preprocessing OS cmd pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
def main():
    load_dotenv()
    workspace_name = os.environ.get("WS_NAME")
    resource_group = os.environ.get("RG_NAME")
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    aks_name = os.environ.get("AKS_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")
    experiment_name = os.environ.get("EXPERIMENT_NAME")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(
        workspace_name,
        resource_group,
        subscription_id,
        tenant_id,
        app_id,
        app_secret)

    print('Now accessing:')
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(
        aml_workspace,
        compute_name,
        vm_size)
    if aml_compute is not None:
        print(aml_compute)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=['numpy', 'pandas',
                        'scikit-learn', 'keras'],
        pip_packages=['azureml-core==1.25.0',
                      'azureml-defaults==1.25.0',
                      'azureml-telemetry==1.25.0',
                      'azureml-train-restclients-hyperdrive==1.25.0',
                      'azureml-train-core==1.25.0',
                      'azureml-dataprep',
                      'tensorflow-gpu==2.0.0',
                      'transformers==2.0.0',
                      'absl-py',
                      'azureml-dataprep',
                      'h5py<3.0.0'])
    )
    # run_config.environment.docker.enabled = True

    datastore_name = 'mtcseattle'
    container_name = 'azure-service-classifier'
    account_name = 'mtcseattle'
    sas_token = '?sv=2020-04-08&st=2021-05-26T04%3A39%3A46Z&se=2022-05-27T04%3A39%3A00Z&sr=c&sp=rl&sig=CTFMEu24bo2X06G%2B%2F2aKiiPZBzvlWHELe15rNFqULUk%3D'

    try:
        existing_datastore = Datastore.get(aml_workspace, datastore_name)
    except:  # noqa: E722
        existing_datastore = Datastore \
            .register_azure_blob_container(workspace=aml_workspace,
                                           datastore_name=datastore_name,
                                           container_name=container_name,
                                           account_name=account_name,
                                           sas_token=sas_token,
                                           overwrite=True)

    azure_dataset = Dataset.File.from_files(
        path=(existing_datastore, 'data'))

    azure_dataset = azure_dataset.register(
        workspace=aml_workspace,
        name='Azure Services Dataset',
        description='Dataset containing azure related posts on Stackoverflow',
        create_new_version=True)

    azure_dataset.to_path()
    input_data = azure_dataset.as_named_input('azureservicedata').as_mount(
        '/tmp/data')

    model_name = PipelineParameter(
        name="model_name", default_value=model_name)
    max_seq_length = PipelineParameter(
        name="max_seq_length", default_value=128)
    learning_rate = PipelineParameter(
        name="learning_rate", default_value=3e-5)
    num_epochs = PipelineParameter(
        name="num_epochs", default_value=1)
    export_dir = PipelineParameter(
        name="export_dir", default_value="./outputs/model")
    batch_size = PipelineParameter(
        name="batch_size", default_value=32)
    steps_per_epoch = PipelineParameter(
        name="steps_per_epoch", default_value=1)

    # initialize the PythonScriptStep
    train_step = PythonScriptStep(
        name='Train Model',
        script_name=train_script_path,
        arguments=['--data_dir', input_data,
                   '--max_seq_length', max_seq_length,
                   '--batch_size', batch_size,
                   '--learning_rate', learning_rate,
                   '--steps_per_epoch', steps_per_epoch,
                   '--num_epochs', num_epochs,
                   '--export_dir',export_dir],
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        runconfig=run_config,
        allow_reuse=True)
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--model_name", model_name,
            "--build_id", build_id,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    # Currently, the Evaluate step will automatically register
    # the model if it performs better. This step is based on a
    # previous version of the repo which utilized JSON files to
    # track evaluation results.

    evaluate_step.run_after(train_step)
    steps = [evaluate_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline.",
        version=build_id
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')

    response = published_pipeline.submit(  # noqa: F841
               workspace=aml_workspace,
               experiment_name=experiment_name)
Beispiel #8
0
def main():
    env = EnvironmentVariables()
    args = add_arguments()

    workspace = get_workspace()

    cpu_cluster_name = env.cpu_cluster_name
    compute = get_or_create_compute(workspace, cpu_cluster_name,
                                    env.compute_vm_size, env.max_nodes)

    environment = Environment.load_from_directory(env.sources_directory_train)
    environment.register(workspace)
    run_configuration = RunConfiguration()
    run_configuration.environment = environment

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=env.model_name)
    build_id_param = PipelineParameter(name="build_id",
                                       default_value=env.build_id)
    should_tune_hyperparameters_param = PipelineParameter(
        name="should_tune_hyperparameters",
        default_value=env.should_tune_hyperparameters)
    parallelism_level_param = PipelineParameter(
        name="parallelism_level", default_value=env.parallelism_level)
    force_register_param = PipelineParameter(name="force_register",
                                             default_value=env.force_register)

    datastore = get_datastore()

    dataset_name = env.dataset_name
    dataset_path = env.dataset_path
    print(
        f"Creating new dataset version for {dataset_name} in datastore {datastore} from file {dataset_path}"
    )
    temp_dataset = Dataset.Tabular.from_delimited_files(path=[(datastore,
                                                               dataset_path)])
    dataset = temp_dataset.register(workspace=workspace,
                                    name=dataset_name,
                                    description=dataset_name,
                                    tags={'format': 'CSV'},
                                    create_new_version=True)

    train_output = PipelineData('train_output',
                                output_name='train_output',
                                datastore=datastore)

    train_step = PythonScriptStep(
        name="Train model",
        compute_target=compute,
        script_name=env.train_script_name,
        runconfig=run_configuration,
        inputs=[dataset.as_named_input('training')],
        outputs=[train_output],
        arguments=[
            "--build_id", build_id_param, "--model_name", model_name_param,
            "--parallelism_level", parallelism_level_param,
            "--should_tune_hyperparameters", should_tune_hyperparameters_param
        ],
        allow_reuse=False)

    evaluate_step = PythonScriptStep(name="Evaluate model",
                                     compute_target=compute,
                                     script_name=env.evaluate_script_name,
                                     runconfig=run_configuration,
                                     inputs=[train_output],
                                     arguments=[
                                         "--build_id", build_id_param,
                                         "--model_name", model_name_param,
                                         "--train_output", train_output,
                                         "--force_register",
                                         force_register_param
                                     ],
                                     allow_reuse=False)

    register_step = PythonScriptStep(name="Register model",
                                     compute_target=compute,
                                     script_name=env.register_script_name,
                                     runconfig=run_configuration,
                                     inputs=[train_output],
                                     arguments=[
                                         "--build_id", build_id_param,
                                         "--model_name", model_name_param,
                                         "--train_output", train_output
                                     ],
                                     allow_reuse=False)

    evaluate_step.run_after(train_step)
    register_step.run_after(evaluate_step)

    steps = [train_step, evaluate_step, register_step]

    train_pipeline = Pipeline(workspace=workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=env.pipeline_name,
        description="Train/Eval/Register if better pipeline",
        version=env.build_id)

    output_file_name = args.output_file_name
    if output_file_name:
        with open(output_file_name, "w") as output_file:
            output_file.write(published_pipeline.id)

    print(
        f"Published pipeline {published_pipeline.name} for build {published_pipeline.version}"
    )
Beispiel #9
0
def main():
    load_dotenv()
    workspace_name = os.environ.get("WORKSPACE_NAME")
    resource_group = os.environ.get("RESOURCE_GROUP_NAME")
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    generate_report_path = os.environ.get("GENERATE_REPORT_PATH")
    generate_report_name = os.environ.get("GENERATE_REPORT_NAME")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_GPU_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    ckpt_path = os.environ.get("MODEL_CHECKPOINT_PATH")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")
    epis_datastore = os.environ.get("EPIS_DATASTORE")
    epis_container = os.environ.get("EPIS_CONTAINER")

    aml_workspace = get_workspace(
        workspace_name,
        resource_group,
        subscription_id,
        tenant_id,
        app_id,
        app_secret)
    print(aml_workspace)

    aml_compute = get_compute(
        aml_workspace,
        compute_name,
        vm_size)
    if aml_compute is not None:
        print(aml_compute)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=['numpy==1.18.1', 'pandas', 'tensorflow-gpu==2.0.0'],
        pip_packages=['azure', 'azureml-core==1.0.60', 'azureml-tensorboard', 'azure-storage==0.36.0',
                    'tqdm==4.41.1', 'opencv-python==4.1.2.30', 'easydict==1.9', 'matplotlib==3.1.3'])
    )
    run_config.environment.docker.enabled = True
    run_config.environment.docker.gpu_support = True
    run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE

    model_name = PipelineParameter(
        name="model_name", default_value=model_name)
    release_id = PipelineParameter(
        name="release_id", default_value=build_id)

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=train_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--release_id", release_id,
            "--model_name", model_name,
            "--ckpt_path", ckpt_path,
            "--datastore", epis_datastore,
            "--storage_container", epis_container,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model",
        script_name=evaluate_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--release_id", release_id,
            "--model_name", model_name,
            "--ckpt_path", ckpt_path,
            "--datastore", epis_datastore,
            "--storage_container", epis_container,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    generate_report_step = PythonScriptStep(
        name="Generate Report Model",
        script_name=generate_report_name,
        compute_target=aml_compute,
        source_directory=generate_report_path,
        arguments=[
            "--release_id", release_id,
            "--model_name", model_name,
            "--ckpt_path", ckpt_path,
            "--datastore", epis_datastore,
            "--storage_container", epis_container,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step generate report created")

    evaluate_step.run_after(train_step)
    generate_report_step.run_after(evaluate_step)
    steps = [train_step, evaluate_step, generate_report_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline",
        version=build_id
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Beispiel #10
0
data_factory_compute = DataFactoryCompute(ws, data_factory_name)
aml_compute = AmlCompute(ws, aml_compute_target)

# We explicitly declare the data we're using in this training pipeline
source_images = DataReference(datastore=source_ds,
                              data_reference_name="original_images",
                              path_on_datastore=default_dataset)
dest_images = DataReference(datastore=ds,
                            data_reference_name="transferred_images",
                            path_on_datastore='training_images')

training_dataset = DataPath(datastore=source_ds,
                            path_on_datastore=default_dataset)

# Parameters make it easy for us to re-run this training pipeline, including for retraining.
model_variant = PipelineParameter(name="model_variant",
                                  default_value='sodacans')
training_dataset_param = (PipelineParameter(name="training_dataset",
                                            default_value=training_dataset),
                          DataPathComputeBinding())

# Copying data into a datastore we manage ensures we can reproduce the model later on.
datatransfer = DataTransferStep(
    name=
    "Copy training data for improved performance and model reproducibility",
    source_data_reference=source_images,
    destination_data_reference=dest_images,
    compute_target=data_factory_compute)

# We pass the trained model from the transfer learning step to the model registration step
model = PipelineData(name="model",
                     datastore=ds,
Beispiel #11
0
def main():
    e = Env()
    print(e.__dict__)
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(aml_workspace,
                                  e.aml_env_name,
                                  create_new=e.rebuild_env)  #
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    dataset_version_param = PipelineParameter(name="dataset_version",
                                              default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(name="data_file_path",
                                             default_value="none")
    caller_run_id_param = PipelineParameter(name="caller_run_id",
                                            default_value="none")

    # Get dataset name
    dataset_name = e.dataset_name

    # Check to see if dataset exists
    if dataset_name not in aml_workspace.datasets:
        raise ValueError(
            f"can't find dataset {dataset_name} in datastore {datastore_name}")

    # Create PipelineData to pass data between steps
    model_data = PipelineData("model_data",
                              datastore=aml_workspace.get_default_datastore())
    train_ds = (PipelineData("train_ds",
                             datastore=aml_workspace.get_default_datastore()).
                as_dataset().parse_delimited_files().register(
                    name="train", create_new_version=True))
    test_ds = (PipelineData(
        "test_ds", datastore=aml_workspace.get_default_datastore()).as_dataset(
        ).parse_delimited_files().register(name="test",
                                           create_new_version=True))

    prepare_step = PythonScriptStep(
        name="Prepare Data",
        script_name=e.prepare_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[train_ds, test_ds],
        arguments=[
            "--dataset_version", dataset_version_param, "--data_file_path",
            data_file_path_param, "--dataset_name", dataset_name,
            "--caller_run_id", caller_run_id_param, "--train_ds", train_ds,
            "--test_ds", test_ds
        ],
        runconfig=run_config,
        allow_reuse=True,
    )
    print("Step Prepare created")

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[
            train_ds.as_named_input("training_data"),
            test_ds.as_named_input("testing_data")
        ],
        outputs=[model_data],
        arguments=[
            "--model_name", model_name_param, "--model_data", model_data
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[model_data],
        arguments=[
            "--model_name", model_name_param, "--step_input", model_data
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if (e.run_evaluation).lower() == "true":
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [prepare_step, train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [prepare_step, train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print(f"get_workspace:{aml_workspace}")

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print(f"aml_compute:{aml_compute}")

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        conda_dependencies_file=e.aml_env_train_conda_dep_file,
        create_new=e.rebuild_env,
    )  #
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501

    # datastore and dataset names are fixed for this pipeline, however
    # data_file_path can be specified for registering new versions of dataset
    # Note that AML pipeline parameters don't take empty string as default, "" won't work  # NOQA: E501
    model_name_param = PipelineParameter(
        name="model_name", default_value=e.model_name)  # NOQA: E501
    data_file_path_param = PipelineParameter(
        name="data_file_path", default_value="nopath")  # NOQA: E501
    ml_params = PipelineParameter(name="ml_params",
                                  default_value="default")  # NOQA: E501

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        "pipeline_data", datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name="train/train_aml.py",
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_output",
            pipeline_data,
            "--data_file_path",
            data_file_path_param,
            "--dataset_name",
            e.processed_dataset_name,
            "--datastore_name",
            datastore_name,
            "--ml_params",
            ml_params,
        ],
        runconfig=run_config,
        allow_reuse=True,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name="evaluate/evaluate_model.py",
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--ml_params",
            ml_params,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name="register/register_model.py",
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_input",
            pipeline_data,
            "--ml_params",
            ml_params,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")

    evaluate_step.run_after(train_step)
    register_step.run_after(evaluate_step)
    steps = [train_step, evaluate_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.training_pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable run configuration environment
    # Read definition from diabetes_regression/azureml_environment.json
    environment = Environment.load_from_directory(e.sources_directory_train)
    if (e.collection_uri is not None and e.teamproject_name is not None):
        builduri_base = e.collection_uri + e.teamproject_name
        builduri_base = builduri_base + "/_build/results?buildId="
        environment.environment_variables["BUILDURI_BASE"] = builduri_base
    environment.register(aml_workspace)

    run_config = RunConfiguration()
    run_config.environment = environment

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    build_id_param = PipelineParameter(name="build_id",
                                       default_value=e.build_id)

    # Get dataset name
    dataset_name = e.dataset_name

    # Check to see if dataset exists
    if (dataset_name not in aml_workspace.datasets):
        # Create dataset from diabetes sample data
        sample_data = load_diabetes()
        df = pd.DataFrame(data=sample_data.data,
                          columns=sample_data.feature_names)
        df['Y'] = sample_data.target
        file_name = 'diabetes.csv'
        df.to_csv(file_name, index=False)

        # Upload file to default datastore in workspace
        default_ds = aml_workspace.get_default_datastore()
        target_path = 'training-data/'
        default_ds.upload_files(files=[file_name],
                                target_path=target_path,
                                overwrite=True,
                                show_progress=False)

        # Register dataset
        path_on_datastore = os.path.join(target_path, file_name)
        dataset = Dataset.Tabular.from_delimited_files(
            path=(default_ds, path_on_datastore))
        dataset = dataset.register(workspace=aml_workspace,
                                   name=dataset_name,
                                   description='diabetes training data',
                                   tags={'format': 'CSV'},
                                   create_new_version=True)

    # Get the dataset
    dataset = Dataset.get_by_name(aml_workspace, dataset_name)

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        'pipeline_data', datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[dataset.as_named_input('training_data')],
        outputs=[pipeline_data],
        arguments=[
            "--build_id", build_id_param, "--model_name", model_name_param,
            "--step_output", pipeline_data
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
            "--step_input",
            pipeline_data,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Beispiel #14
0
    def __init__(self, request_id: str, use_url: bool,
                 input_container_sas: Optional[str],
                 internal_datastore: Mapping[str, str], model_name: str):
        try:
            self.request_id = request_id

            aml_config = api_config.AML_CONFIG

            self.ws = Workspace(
                subscription_id=aml_config['subscription_id'],
                resource_group=aml_config['resource_group'],
                workspace_name=aml_config['workspace_name'],
                auth=svc_pr
            )
            print('AMLCompute constructor, AML workspace obtained.')

            internal_dir, output_dir = self._get_data_references(
                request_id, internal_datastore)

            aml_compute_name = aml_config['aml_compute_name']
            compute_target = self.ws.compute_targets[aml_compute_name]

            pip_packages = ['tensorflow-gpu==1.12.0',
                            'pillow',
                            'numpy',
                            'azure-storage-blob==2.1.0',
                            'azureml-defaults==1.0.41']
            dependencies = CondaDependencies.create(pip_packages=pip_packages)

            amlcompute_run_config = RunConfiguration(
                conda_dependencies=dependencies)
            amlcompute_run_config.environment.docker.enabled = True
            amlcompute_run_config.environment.docker.gpu_support = True
            amlcompute_run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE  # pylint: disable=line-too-long
            amlcompute_run_config.environment.spark.precache_packages = False

            # default values are required and need to be literal values or data
            # references as JSON
            param_job_id = PipelineParameter(
                name='param_job_id', default_value='default_job_id')
            param_begin_index = PipelineParameter(
                name='param_begin_index', default_value=0)
            param_end_index = PipelineParameter(
                name='param_end_index', default_value=0)
            param_detection_threshold = PipelineParameter(
                name='param_detection_threshold', default_value=0.05)

            batch_score_step = PythonScriptStep(
                aml_config['script_name'],
                source_directory=aml_config['source_dir'],
                hash_paths=['.'],  # include all contents of source_directory
                name='batch_scoring',
                arguments=[
                    '--job_id', param_job_id,
                    '--request_id', request_id,
                    '--model_name', model_name,
                    '--input_container_sas', input_container_sas,  # can be None
                    '--use_url', use_url,
                    '--internal_dir', internal_dir,
                    '--begin_index', param_begin_index,  # inclusive
                    '--end_index', param_end_index,  # exclusive
                    '--output_dir', output_dir,
                    '--detection_threshold', param_detection_threshold],
                compute_target=compute_target,
                inputs=[internal_dir],
                outputs=[output_dir],
                runconfig=amlcompute_run_config)
            self.pipeline = Pipeline(
                workspace=self.ws, steps=[batch_score_step])
            self.aml_config = aml_config
            print('AMLCompute constructor all good.')
        except Exception as e:
            raise RuntimeError('Error setting up AML Compute resource: {}.'.format(e))
Beispiel #15
0
pipeline_run1 = Experiment(ws, 'Titanic_Pipeline_Notebook').submit(pipeline1)
pipeline_run1.wait_for_completion()

# RunDetails(pipeline_run1).show()

step_runs = pipeline_run1.get_children()
for step_run in step_runs:
    status = step_run.get_status()
    print('Script:', step_run.name, 'status:', status)

    # Change this if you want to see details even if the Step has succeeded.
    if status == "Failed":
        joblog = step_run.get_job_log()
        print('job log:', joblog)

pipeline_param = PipelineParameter(name="pipeline_arg", default_value=10)

published_pipeline1 = pipeline_run1.publish_pipeline(
    name="Published_Titanic_Pipeline_Notebook",
    description="Titanic_Pipeline_Notebook Published Pipeline Description",
    version="1.0")

from azureml.pipeline.core import PublishedPipeline
import requests

response = requests.post(published_pipeline1.endpoint,
                         json={
                             "ExperimentName": "Titanic_Pipeline_Notebook",
                             "ParameterAssignments": {
                                 "pipeline_arg": 20
                             }
def main():
    e = Env()
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group
    )
    print("get_workspace:")
    print(aml_workspace)

    aml_compute = get_compute(
        aml_workspace,
        e.compute_name,
        e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    environment = get_environment(
        aml_workspace, e.aml_env_name, create_new=e.rebuild_env)
    run_config = RunConfiguration()
    run_config.environment = environment

    if (e.datastore_name):
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name

    run_config.environment.environment_variables["DATASTORE_NAME"] \
        = datastore_name

    dataset_name = e.dataset_name
    file_name = e.file_name
    datastore = Datastore.get(aml_workspace, datastore_name)

    if (dataset_name not in aml_workspace.datasets):
        raise Exception("Could not find dataset at \"%s\"." % dataset_name)
    else:
        dataset = Dataset.get_by_name(aml_workspace, name=dataset_name)
        dataset.download(target_path='.', overwrite=True)
        datastore.upload_files([file_name],
                               target_path=dataset_name,
                               overwrite=True)

    raw_data_file = DataReference(datastore=datastore,
                                  data_reference_name="Raw_Data_File",
                                  path_on_datastore=dataset_name + '/'
                                  + file_name)

    clean_data_file = PipelineParameter(name="clean_data_file",
                                        default_value="/clean_data.csv")
    clean_data_folder = PipelineData("clean_data_folder",
                                     datastore=datastore)

    prepDataStep = PythonScriptStep(name="Prepare Data",
                                    source_directory=e.sources_directory_train,
                                    script_name=e.data_prep_script_path,
                                    arguments=["--raw_data_file",
                                               raw_data_file,
                                               "--clean_data_folder",
                                               clean_data_folder,
                                               "--clean_data_file",
                                               clean_data_file],
                                    inputs=[raw_data_file],
                                    outputs=[clean_data_folder],
                                    compute_target=aml_compute,
                                    allow_reuse=False)

    print("Step Prepare Data created")

    new_model_file = PipelineParameter(name="new_model_file ",
                                       default_value='/' + e.model_name
                                       + '.pkl')
    new_model_folder = PipelineData("new_model_folder", datastore=datastore)
    est = SKLearn(source_directory=e.sources_directory_train,
                  entry_script=e.train_script_path,
                  pip_packages=['azureml-sdk', 'scikit-learn==0.20.3',
                                'azureml-dataprep[pandas,fuse]>=1.1.14'],
                  compute_target=aml_compute)

    trainingStep = EstimatorStep(
        name="Model Training",
        estimator=est,
        estimator_entry_script_arguments=["--clean_data_folder",
                                          clean_data_folder,
                                          "--new_model_folder",
                                          new_model_folder,
                                          "--clean_data_file",
                                          clean_data_file.default_value,
                                          "--new_model_file",
                                          new_model_file.default_value],
        runconfig_pipeline_params=None,
        inputs=[clean_data_folder],
        outputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Train created")

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)

    evaluateStep = PythonScriptStep(
        name="Evaluate Model",
        source_directory=e.sources_directory_train,
        script_name=e.evaluate_script_path,
        arguments=["--model_name", model_name_param],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Evaluate created")

    registerStep = PythonScriptStep(
        name="Register Model",
        source_directory=e.sources_directory_train,
        script_name=e.register_script_path,
        arguments=["--new_model_folder", new_model_folder,
                   "--new_model_file", new_model_file,
                   "--model_name", model_name_param],
        inputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Register created")

    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        trainingStep.run_after(prepDataStep)
        evaluateStep.run_after(trainingStep)
        registerStep.run_after(evaluateStep)
    else:
        print("Exclude evaluation step and directly run register step.")
        trainingStep.run_after(prepDataStep)
        registerStep.run_after(trainingStep)

    pipeline = Pipeline(workspace=aml_workspace, steps=[registerStep])
    pipeline.validate()
    print("Pipeline is built")

    pipeline._set_experiment_name
    published_pipeline = pipeline.publish(
        name=e.pipeline_name,
        description="Predict Employee Retention Model training pipeline",
        version=e.build_id
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Beispiel #17
0
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = get_workspace(e.workspace_name, e.resource_group,
                                  e.subscription_id, e.tenant_id, e.app_id,
                                  e.app_secret)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=[
            'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
        ],
        pip_packages=[
            'azure', 'azureml-core', 'azure-storage', 'azure-storage-blob'
        ]))
    run_config.environment.docker.enabled = True

    model_name = PipelineParameter(name="model_name",
                                   default_value=e.model_name)
    release_id = PipelineParameter(name="release_id", default_value="0")

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--release_id",
            release_id,
            "--model_name",
            model_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--release_id",
            release_id,
            "--model_name",
            model_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    evaluate_step.run_after(train_step)
    steps = [evaluate_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a run configuration environment
    conda_deps_file = "diabetes_regression/training_dependencies.yml"
    conda_deps = CondaDependencies(conda_deps_file)
    run_config = RunConfiguration(conda_dependencies=conda_deps)
    run_config.environment.docker.enabled = True
    config_envvar = {}
    if (e.collection_uri is not None and e.teamproject_name is not None):
        builduri_base = e.collection_uri + e.teamproject_name
        builduri_base = builduri_base + "/_build/results?buildId="
        config_envvar["BUILDURI_BASE"] = builduri_base
    run_config.environment.environment_variables = config_envvar

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    build_id_param = PipelineParameter(name="build_id",
                                       default_value=e.build_id)

    dataset_name = ""
    if (e.datastore_name is not None and e.datafile_name is not None):
        dataset_name = e.dataset_name
        datastore = Datastore.get(aml_workspace, e.datastore_name)
        data_path = [(datastore, e.datafile_name)]
        dataset = Dataset.Tabular.from_delimited_files(path=data_path)
        dataset.register(workspace=aml_workspace,
                         name=e.dataset_name,
                         description="dataset with training data",
                         create_new_version=True)

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
            "--dataset_name",
            dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=[
            'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
        ],
        pip_packages=[
            'azure', 'azureml-core', 'azure-storage', 'azure-storage-blob'
        ]))
    run_config.environment.docker.enabled = True

    config_envvar = {}
    if (e.collection_uri is not None and e.teamproject_name is not None):
        builduri_base = e.collection_uri + e.teamproject_name
        builduri_base = builduri_base + "/_build/results?buildId="
        config_envvar["BUILDURI_BASE"] = builduri_base
    run_config.environment.environment_variables = config_envvar

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    build_id_param = PipelineParameter(name="build_id",
                                       default_value=e.build_id)
    hyperparameter_alpha_param = PipelineParameter(name="hyperparameter_alpha",
                                                   default_value=0.5)

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
            "--alpha",
            hyperparameter_alpha_param,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")

    evaluate_step.run_after(train_step)
    register_step.run_after(evaluate_step)
    steps = [train_step, evaluate_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Beispiel #20
0
    def __init__(self, request_id, input_container_sas, internal_datastore):
        try:
            aml_config = api_config.AML_CONFIG

            self.ws = Workspace(subscription_id=aml_config['subscription_id'],
                                resource_group=aml_config['resource_group'],
                                workspace_name=aml_config['workspace_name'],
                                auth=svc_pr)
            print('AMLCompute constructor, AML workspace obtained.')

            internal_dir, output_dir = self._get_data_references(
                request_id, internal_datastore)

            compute_target = self.ws.compute_targets[
                aml_config['aml_compute_name']]

            dependencies = CondaDependencies.create(pip_packages=[
                'tensorflow-gpu==1.9.0', 'pillow', 'numpy', 'azure',
                'azure-storage-blob', 'azureml-defaults'
            ])

            amlcompute_run_config = RunConfiguration(
                conda_dependencies=dependencies)
            amlcompute_run_config.environment.docker.enabled = True
            amlcompute_run_config.environment.docker.gpu_support = True
            amlcompute_run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE
            amlcompute_run_config.environment.spark.precache_packages = False

            # default values are required and need to be literal values or data references as JSON
            param_job_id = PipelineParameter(name='param_job_id',
                                             default_value='default_job_id')

            param_begin_index = PipelineParameter(name='param_begin_index',
                                                  default_value=0)
            param_end_index = PipelineParameter(name='param_end_index',
                                                default_value=0)

            param_detection_threshold = PipelineParameter(
                name='param_detection_threshold', default_value=0.05)
            param_batch_size = PipelineParameter(name='param_batch_size',
                                                 default_value=8)

            batch_score_step = PythonScriptStep(
                aml_config['script_name'],
                source_directory=aml_config['source_dir'],
                name='batch_scoring',
                arguments=[
                    '--job_id',
                    param_job_id,
                    '--model_name',
                    aml_config['model_name'],
                    '--input_container_sas',
                    input_container_sas,
                    '--internal_dir',
                    internal_dir,
                    '--begin_index',
                    param_begin_index,  # inclusive
                    '--end_index',
                    param_end_index,  # exclusive
                    '--output_dir',
                    output_dir,
                    '--detection_threshold',
                    param_detection_threshold,
                    '--batch_size',
                    param_batch_size
                ],
                compute_target=compute_target,
                inputs=[internal_dir],
                outputs=[output_dir],
                runconfig=amlcompute_run_config)

            self.pipeline = Pipeline(workspace=self.ws,
                                     steps=[batch_score_step])
            self.aml_config = aml_config
            print('AMLCompute constructor all good.')
        except Exception as e:
            raise RuntimeError(
                'Error in setting up AML Compute resource: {}.'.format(str(e)))
Beispiel #21
0
    create_or_update_schedule,
    get_or_create_compute,
    get_workspace,
    publish_pipeline,
)
from config import ml_pipeline_test_config as config

ws = get_workspace(config)

compute_target = get_or_create_compute(ws, **config["compute"])

###
# Define and set up pipeline
###

pipeline_param = PipelineParameter(name="my_arg", default_value="default")

my_step = PythonScriptStep(
    name="My Script Step",
    script_name="scriptstep.py",
    arguments=[pipeline_param],
    inputs=[],
    outputs=[],
    compute_target=compute_target,
    source_directory="src",
    allow_reuse=True,
    runconfig=RunConfiguration(conda_dependencies=CondaDependencies(
        conda_dependencies_file_path="environment.yml")),
)

pipeline_id, pipeline_endpoint = publish_pipeline(ws, [my_step], "blabla")
Beispiel #22
0
args = parser.parse_args()

# Get workspace
ws = Workspace.from_config(path="aml_config/config.json", auth=cli_auth)
def_blob_store = Datastore(ws, "workspaceblobstore")

# Get AML Compute name and Experiment Name
with open("aml_config/security_config.json") as f:
    config = json.load(f)

experiment_name = config["experiment_name"]
aml_cluster_name = config["aml_cluster_name"]
aml_pipeline_name = "training-pipeline"
#model_name = config["model_name"]
model_name = PipelineParameter(name="model_name",
                               default_value="sklearn_regression_model.pkl")

source_directory = "code"

# Run Config
# Declare packages dependencies required in the pipeline (these can also be expressed as a YML file)
# cd = CondaDependencies.create(pip_packages=["azureml-defaults", 'tensorflow==1.8.0'])
cd = CondaDependencies("aml_config/conda_dependencies.yml")

run_config = RunConfiguration(conda_dependencies=cd)
aml_compute = ws.compute_targets[aml_cluster_name]
run_config.environment.docker.enabled = True
run_config.environment.spark.precache_packages = False

jsonconfigs = PipelineData("jsonconfigs", datastore=def_blob_store)
Beispiel #23
0
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(aml_workspace,
                                  e.aml_env_name,
                                  create_new=False)  # NOQA: E501

    run_config = RunConfiguration()
    run_config.environment = environment

    if (e.datastore_name):
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    dataset_version_param = PipelineParameter(name="dataset_version",
                                              default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(name="data_file_path",
                                             default_value="none")
    caller_run_id_param = PipelineParameter(name="caller_run_id",
                                            default_value="none")

    # Get dataset name
    dataset_name = e.dataset_name

    # # Check to see if dataset exists
    # if (dataset_name not in aml_workspace.datasets):
    #     # Create dataset from lacemlops sample data
    #     sample_data = load_lacemlops()
    #     df = pd.DataFrame(
    #         data=sample_data.data,
    #         columns=sample_data.feature_names)
    #     df['Y'] = sample_data.target
    #     file_name = 'lacemlops.csv'
    #     df.to_csv(file_name, index=False)

    #     # Upload file to default datastore in workspace
    #     datatstore = Datastore.get(aml_workspace, datastore_name)
    #     target_path = 'training-data/'
    #     datatstore.upload_files(
    #         files=[file_name],
    #         target_path=target_path,
    #         overwrite=True,
    #         show_progress=False)

    #     # Register dataset
    #     path_on_datastore = os.path.join(target_path, file_name)
    #     dataset = Dataset.Tabular.from_delimited_files(
    #         path=(datatstore, path_on_datastore))
    #     dataset = dataset.register(
    #         workspace=aml_workspace,
    #         name=dataset_name,
    #         description='lacemlops training data',
    #         tags={'format': 'CSV'},
    #         create_new_version=True)

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        'pipeline_data', datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_output",
            pipeline_data,
            "--dataset_version",
            dataset_version_param,
            "--data_file_path",
            data_file_path_param,
            "--caller_run_id",
            caller_run_id_param,
            "--dataset_name",
            dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_input",
            pipeline_data,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
    run_config = RunConfiguration()
    run_config.environment.docker.enabled = True
    run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    run_config.environment.python.user_managed_dependencies = False
    pip_packages = [
        "azureml-sdk==1.0.17", "scikit-learn==0.21.3", "download==0.3.4",
        "pandas==0.25.1", "spacy==2.1.4", "numpy==1.17.2"
    ]

    run_config.environment.python.conda_dependencies = CondaDependencies.create(
        pip_packages=pip_packages)

    pipeline_params = []
    for k, v in vars(auth_params).items():
        pipeline_params.append("--" + k)
        pipeline_params.append(PipelineParameter(name=k, default_value=v))

    auth_params = pipeline_params.copy()
    pipeline_params += ["--processed_data_ref", processed_data_ref]
    pipeline_params += ["--input_data_ref", input_data_ref]
    process_step = PythonScriptStep(script_name="process.py",
                                    arguments=pipeline_params,
                                    inputs=[input_data_ref],
                                    outputs=[processed_data_ref],
                                    compute_target=compute_target_cpu,
                                    source_directory='./',
                                    runconfig=run_config)
    #     step 2

    cluster_name = "gpucluster"
    try:
Beispiel #25
0
def main():
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = os.environ.get("RESOURCE_GROUP")
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    deploy_script_path = os.environ.get("DEPLOY_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("DEPLOY_PIPELINE_NAME")
    service_name = os.environ.get("DEPLOY_SERVICE_NAME")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(workspace_name, resource_group,
                                  subscription_id, tenant_id, app_id,
                                  app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, compute_name, vm_size)
    if aml_compute is not None:
        print(aml_compute)

    conda_dependencies = CondaDependencies.create(
        conda_packages=['numpy', 'pandas', 'scikit-learn'],
        pip_packages=[
            'azureml-core==1.0.72.*', 'azureml-sdk==1.0.72.*', 'azure-storage',
            'azure-storage-blob', 'azureml-dataprep',
            'azureml-datadrift==1.0.72.*'
        ],
        pin_sdk_version=False)

    print(conda_dependencies.serialize_to_string())

    run_config = RunConfiguration(framework='Python',
                                  conda_dependencies=conda_dependencies)
    run_config.environment.docker.enabled = True

    model_name = PipelineParameter(name="model_name", default_value=model_name)
    print(model_name)
    release_id = PipelineParameter(name="release_id", default_value="0")
    print(release_id)
    service_name = PipelineParameter(name="service_name",
                                     default_value=service_name)
    print(service_name)

    deploy_step = PythonScriptStep(
        name="Deploy Model",
        script_name=deploy_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--release_id", release_id, "--model_name", model_name,
            "--service_name", service_name
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Deploy created")

    steps = [deploy_step]

    deploy_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    deploy_pipeline.validate()
    published_pipeline = deploy_pipeline.publish(
        name=pipeline_name,
        description="Model deploy  pipeline",
        version=build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
 def __create_pipeline_parameter(self, pipeline_parameter_name,
                                 pipeline_parameter_value):
     return PipelineParameter(name=pipeline_parameter_name,
                              default_value=pipeline_parameter_value)
Beispiel #27
0
def main():
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = os.environ.get("BASE_NAME") + "-AML-RG"
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    register_script_path = os.environ.get("REGISTER_SCRIPT_PATH")
    vm_size_cpu = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
    compute_name_cpu = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(workspace_name, resource_group,
                                  subscription_id, tenant_id, app_id,
                                  app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute_cpu = get_compute(aml_workspace, compute_name_cpu, vm_size_cpu)
    if aml_compute_cpu is not None:
        print(aml_compute_cpu)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=[
            'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
        ]))
    run_config.environment.docker.enabled = True

    model_name = PipelineParameter(name="model_name", default_value=model_name)
    def_blob_store = Datastore(aml_workspace, "workspaceblobstore")
    jsonconfigs = PipelineData("jsonconfigs", datastore=def_blob_store)
    config_suffix = datetime.datetime.now().strftime("%Y%m%d%H")

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=train_script_path,
        compute_target=aml_compute_cpu,
        source_directory=sources_directory_train,
        arguments=[
            "--config_suffix",
            config_suffix,
            "--json_config",
            jsonconfigs,
            "--model_name",
            model_name,
        ],
        runconfig=run_config,
        # inputs=[jsonconfigs],
        outputs=[jsonconfigs],
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        compute_target=aml_compute_cpu,
        source_directory=sources_directory_train,
        arguments=[
            "--config_suffix",
            config_suffix,
            "--json_config",
            jsonconfigs,
        ],
        runconfig=run_config,
        inputs=[jsonconfigs],
        # outputs=[jsonconfigs],
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_model_step = PythonScriptStep(
        name="Register New Trained Model",
        script_name=register_script_path,
        compute_target=aml_compute_cpu,
        source_directory=sources_directory_train,
        arguments=[
            "--config_suffix",
            config_suffix,
            "--json_config",
            jsonconfigs,
            "--model_name",
            model_name,
        ],
        runconfig=run_config,
        inputs=[jsonconfigs],
        # outputs=[jsonconfigs],
        allow_reuse=False,
    )
    print("Step register model created")

    evaluate_step.run_after(train_step)
    register_model_step.run_after(evaluate_step)
    steps = [register_model_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name="training-pipeline",
        description="Model training/retraining pipeline")

    train_pipeline_json = {}
    train_pipeline_json["rest_endpoint"] = published_pipeline.endpoint
    json_file_path = "ml_service/pipelines/train_pipeline.json"
    with open(json_file_path, "w") as outfile:
        json.dump(train_pipeline_json, outfile)
def main():
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = os.environ.get("BASE_NAME") + "-AML-RG"
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")
    data_path = os.environ.get("DATA_PATH_DATASTORE")
    model_data_path = os.environ.get("MODEL_DATA_PATH_DATASTORE")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(workspace_name, resource_group,
                                  subscription_id, tenant_id, app_id,
                                  app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, compute_name, vm_size)
    if aml_compute is not None:
        print(aml_compute)

    model_name = PipelineParameter(name="model_name", default_value=model_name)
    release_id = PipelineParameter(name="release_id", default_value="0")

    ds = aml_workspace.get_default_datastore()

    dataref_folder = ds.path(data_path).as_mount()
    model_dataref = ds.path(model_data_path).as_mount()

    # NEED those two folders mounted on datastore and env variables specified in variable groups

    #ds.upload(src_dir='./VOCdevkit', target_path='VOCdevkit', overwrite=True, show_progress=True)
    #ds.upload(src_dir='./model_data', target_path='VOCmodel_data', overwrite=True, show_progress=True)

    yoloEstimator = TensorFlow(
        source_directory=sources_directory_train + '/training',
        compute_target=aml_compute,
        entry_script=train_script_path,
        pip_packages=[
            'keras', 'pillow', 'matplotlib', 'onnxmltools', 'keras2onnx==1.5.1'
        ],  # recent versions of keras2onnx give conversion issues 
        use_gpu=True,
        framework_version='1.13')

    train_step = EstimatorStep(name="Train & Convert Model",
                               estimator=yoloEstimator,
                               estimator_entry_script_arguments=[
                                   "--release_id", release_id, "--model_name",
                                   model_name, "--data_folder", dataref_folder,
                                   "--model_path", model_dataref
                               ],
                               runconfig_pipeline_params=None,
                               inputs=[dataref_folder, model_dataref],
                               compute_target=aml_compute,
                               allow_reuse=False)
    print("Step Train & Convert created")

    train_pipeline = Pipeline(workspace=aml_workspace, steps=[train_step])
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline",
        version=build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        conda_dependencies_file=e.aml_env_train_conda_dep_file,
        create_new=e.rebuild_env,
    )  #
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501

    model_name_param = PipelineParameter(
        name="model_name", default_value=e.model_name)  # NOQA: E501
    dataset_version_param = PipelineParameter(name="dataset_version",
                                              default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(name="data_file_path",
                                             default_value="none")
    caller_run_id_param = PipelineParameter(name="caller_run_id",
                                            default_value="none")  # NOQA: E501

    # Get dataset name
    dataset_name = e.dataset_name

    # Check to see if dataset exists
    if dataset_name not in aml_workspace.datasets:
        # This call creates an example CSV from sklearn sample data. If you
        # have already bootstrapped your project, you can comment this line
        # out and use your own CSV.
        create_sample_data_csv()

        # Use a CSV to read in the data set.
        file_name = "safedriver.csv"

        if not os.path.exists(file_name):
            raise Exception(
                'Could not find CSV dataset at "%s". If you have bootstrapped your project, you will need to provide a CSV.'  # NOQA: E501
                % file_name)  # NOQA: E501

        # Upload file to default datastore in workspace
        datatstore = Datastore.get(aml_workspace, datastore_name)
        target_path = "training-data/"
        datatstore.upload_files(
            files=[file_name],
            target_path=target_path,
            overwrite=True,
            show_progress=False,
        )

        # Register dataset
        path_on_datastore = os.path.join(target_path, file_name)
        dataset = Dataset.Tabular.from_delimited_files(
            path=(datatstore, path_on_datastore))
        dataset = dataset.register(
            workspace=aml_workspace,
            name=dataset_name,
            description="safedriver training data",
            tags={"format": "CSV"},
            create_new_version=True,
        )

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        "pipeline_data", datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_output",
            pipeline_data,
            "--dataset_version",
            dataset_version_param,
            "--data_file_path",
            data_file_path_param,
            "--caller_run_id",
            caller_run_id_param,
            "--dataset_name",
            dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=True,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_input",
            pipeline_data,
        ],  # NOQA: E501
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if (e.run_evaluation).lower() == "true":
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
Beispiel #30
0
def main():
    e = Env()
    
    from azureml.core.authentication import InteractiveLoginAuthentication

    myten=os.environ.get("AZURE_TENANT_ID")
    interactive_auth = InteractiveLoginAuthentication(tenant_id=os.environ.get("AZURE_TENANT_ID"))
    subscription=os.environ.get("CSUBSCRIPTION")
    workspace_name=e.workspace_name
    resource_group=e.resource_group

    aml_workspace = Workspace.get(
        name = workspace_name,
        subscription_id = subscription,
        resource_group=resource_group,
        auth=interactive_auth
    )

    from ml_service.util.attach_compute import get_compute

    # Get Azure machine learning cluster
    # If not present then get_compute will create a compute based on environment variables

    aml_compute = get_compute(
        aml_workspace,
        e.compute_name,
        e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    print("SDK version: ", azureml.core.VERSION)

    ## Variable names that can be passed in as parameter values
    from azureml.pipeline.core.graph import PipelineParameter
    from azureml.core import Datastore

    model_name_param = PipelineParameter(
        name="model_name", default_value=e.model_name)
    dataset_version_param = PipelineParameter(
        name="dataset_version", default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(
        name="data_file_path", default_value="none")
    caller_run_id_param = PipelineParameter(
        name="caller_run_id", default_value="none")
    #model_path = PipelineParameter(
    #    name="model_path", default_value=e.model_path)    

    if (e.datastore_name):
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name

    # Get the datastore whether it is the default or named store
    datastore = Datastore.get(aml_workspace, datastore_name)
    dataset_name = e.dataset_name

    # Create a reusable Azure ML environment
    from ml_service.util.manage_environment import get_environment
    from azureml.core import Environment

    # RUN Configuration
    ## Must have this process to work with AzureML-SDK 1.0.85
    from azureml.core.runconfig import RunConfiguration, DEFAULT_CPU_IMAGE
    from azureml.core.conda_dependencies import CondaDependencies

    try:
        app_env=Environment(name="smartschedule_env")
        app_env.register(workspace=aml_workspace)
    except:
        print("Environment not found")
    
    # Create a new runconfig object
    aml_run_config = RunConfiguration()

    aml_run_config.environment.environment_variables["DATASTORE_NAME"] = e.datastore_name  # NOQA: E501

    # Use the aml_compute you created above. 
    aml_run_config.target = aml_compute

    # Enable Docker
    aml_run_config.environment.docker.enabled = True

    # Set Docker base image to the default CPU-based image
    aml_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    #aml_run_config.environment.docker.base_image = "mcr.microsoft.com/azureml/base:0.2.1"

    # Use conda_dependencies.yml to create a conda environment in the Docker image for execution
    aml_run_config.environment.python.user_managed_dependencies = False

    app_conda_deps=CondaDependencies.create(
        conda_packages=['pandas','scikit-learn', 'libgcc','pyodbc', 'sqlalchemy', 'py-xgboost==0.90'], 
        pip_packages=['azureml-sdk[automl,explain,contrib,interpret]==1.4.0', 'xgboost==0.90', 'azureml-dataprep==1.4.6', 'pyarrow', 'azureml-defaults==1.4.0', 'azureml-train-automl-runtime==1.4.0'], pin_sdk_version=False)

    # Specify CondaDependencies obj, add necessary packages
    aml_run_config.environment.python.conda_dependencies = app_conda_deps

    print ("Run configuration created.")
    from azure.common.credentials import ServicePrincipalCredentials
    #from azure.keyvault import KeyVaultClient, KeyVaultAuthentication

    from azure.keyvault.secrets import SecretClient
    from azure.identity import DefaultAzureCredential
    import pandas as pd
    #import sqlalchemy as sql
    import pyodbc

    def get_data(sql_string, columns):
        credentials = None
        credential = DefaultAzureCredential()

        secret_client = SecretClient("https://smrtschd-aml-kv.vault.azure.net", credential=credential)    
        secret = secret_client.get_secret("database-connection")

        #client = KeyVaultClient(KeyVaultAuthentication(auth_callback))
        #secret_bundle = client.get_secret("https://smrtschd-aml-kv.vault.azure.net", "database-connection", "")

        server = 'starlims-sql.database.windows.net'
        database = 'QM12_DATA_AUTOMATION'
        username = '******'
        password = secret.value
        driver= '{ODBC Driver 17 for SQL Server}'
        conn = pyodbc.connect('Driver='+driver+';'+
                            'Server='+server+';'+
                            'Database='+database+';'+
                            'PORT=1433;'+
                            'UID='+username+';'+
                            'PWD='+password+'; MARS_Connection=Yes'
        )

        try:
            SQL_Query = pd.read_sql_query(sql_string, conn)

            df = pd.DataFrame(SQL_Query, columns=columns)
            return df
        except Exception as e:
            print(e)
            raise

    sql_str = "SELECT " \
            "  Dept " \
            ", Method " \
            ", Servgrp " \
            ", Runno " \
            ", TestNo " \
            ", Testcode " \
            ", Total_Duration_Min " \
            ", Total_Duration_Hr " \
            ", Usrnam " \
            ", Eqid " \
            ", Eqtype " \
        "FROM dbo.Draft " \
        "order by TESTCODE, RUNNO, dept, method;"

    columns = ["Dept", "Method", "Servgrp", "Runno", "TestNo", "Testcode", "Total_Duration_Min", "Total_Duration_Hr", "Usrnam", "Eqid","Eqtype"]

    from azureml.core import Dataset
    from sklearn.model_selection import train_test_split

    if (e.train_dataset_name not in aml_workspace.datasets):

        
        df = get_data(sql_str, columns)

        train_df, test_df=train_test_split(df, test_size=0.2)

        MY_DIR = "data"

        CHECK_FOLDER = os.path.isdir(MY_DIR)

        if not CHECK_FOLDER:
            os.makedirs(MY_DIR)
        else:
            print("Folder ", MY_DIR, " is already created")

        #files = ["data/analyst_tests.csv"]
        files = ["data/train_data.csv","data/test_data.csv"]

        def_file_store = Datastore(aml_workspace, "workspacefilestore")

        dtfrm = df.to_csv(files[0], header=True, index=False)

        train_dataframe=train_df.to_csv(files[0], header=True, index=False)
        test_dataframe=test_df.to_csv(files[1], header=True, index=False)
        datastore.upload_files(
            files=files,
            target_path='data/',
            overwrite=True
        )

        from azureml.data.data_reference import DataReference

        blob_input_data_test=DataReference(
            datastore=datastore,
            data_reference_name="smartschedulertest",
            path_on_datastore="data/test_data.csv"
        )
        test_data=Dataset.Tabular.from_delimited_files(blob_input_data_test)
        test_data.register(aml_workspace, e.test_dataset_name, create_new_version=True)

        blob_input_data_train=DataReference(
            datastore=datastore,
            data_reference_name="smartschedulertrain",
            path_on_datastore="data/train_data.csv"
        )
        train_data=Dataset.Tabular.from_delimited_files(blob_input_data_train)
        train_data.register(aml_workspace, e.train_dataset_name, create_new_version=True)

    else:
        from azureml.data.data_reference import DataReference
        print("getting from the datastore instead of uploading")

        train_data=Dataset.get_by_name(aml_workspace, name=e.train_dataset_name)
        test_data=Dataset.get_by_name(aml_workspace, name=e.test_dataset_name)

    # check the training dataset to make sure it has at least 50 records.
    tdf=train_data.to_pandas_dataframe().head(5)

    print(tdf.shape)
    print(tdf)

    # display the first five rows of the data
    # create a variable that can be used for other purposes
    df=train_data.to_pandas_dataframe().head()

    label_column="Total_Duration_Min"

    import random
    import string

    def randomString(stringLength=15):
        letters = string.ascii_lowercase
        return ''.join(random.choice(letters) for i in range(stringLength))

    from azureml.core import Experiment

    experiment = Experiment(aml_workspace, "SmartScheduler_Pipeline")


    import logging

    aml_name = 'smart_scheduler_' + randomString(5)
    print(aml_name)

    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.ticker import StrMethodFormatter

    print(df.head(5))
    print(df.shape)
    print(df.dtypes)

    #df.hist(column='Dept')
    list(df.columns.values)

    # Remove Features that are not necessary.
    #df.hist(column="Servgrp", bins=4)
    train_data=train_data.drop_columns(["Runno","TestNo","Total_Duration_Hr"])
    test_data=test_data.drop_columns(["Runno","TestNo","Total_Duration_Hr"])

    print(train_data.to_pandas_dataframe())
    print(test_data.to_pandas_dataframe())

    from azureml.automl.core.featurization import FeaturizationConfig

    # some of the columns could be change to one hot encoding especially if the categorical column
    featurization_config=FeaturizationConfig()
    featurization_config.blocked_transformers=['LabelEncoder']
    featurization_config.add_column_purpose('Dept', 'CategoricalHash')
    featurization_config.add_transformer_params('HashOneHotEncoder',['Method'], {"number_of_bits":3})
    featurization_config.add_column_purpose('Servgrp', 'CategoricalHash')
    featurization_config.add_column_purpose('Testcode', 'Numeric')
    featurization_config.add_column_purpose('Usrnam', 'CategoricalHash')
    featurization_config.add_column_purpose('Eqid', 'CategoricalHash')
    featurization_config.add_column_purpose('Eqtype', 'CategoricalHash')

    from azureml.pipeline.core import Pipeline, PipelineData
    from azureml.pipeline.steps import PythonScriptStep

    #train_model_folder = './scripts/trainmodel'

    automl_settings = {
        "iteration_timeout_minutes": 5,
        "iterations": 5,
        "enable_early_stopping": True,
        "primary_metric": 'spearman_correlation',
        "verbosity": logging.INFO,
        "n_cross_validation":5
    }

    automl_config = AutoMLConfig(task="regression",
                    debug_log='automated_ml_errors.log',
                    #path = train_model_folder,
                    training_data=train_data,
                    featurization=featurization_config,
                    blacklist_models=['XGBoostRegressor'],
                    label_column_name=label_column,
                    compute_target=aml_compute,
                    **automl_settings)

    from azureml.pipeline.steps import AutoMLStep
    from azureml.pipeline.core import TrainingOutput

    metrics_output_name = 'metrics_output'
    best_model_output_name='best_model_output'

    metrics_data = PipelineData(name = 'metrics_data',
                    datastore = datastore,
                    pipeline_output_name=metrics_output_name,
                    training_output=TrainingOutput(type='Metrics'))

    model_data = PipelineData(name='model_data',
                datastore=datastore,
                pipeline_output_name=best_model_output_name,
                training_output=TrainingOutput(type='Model'))

    trainWithAutomlStep = AutoMLStep(
                        name=aml_name,
                        automl_config=automl_config,
                        passthru_automl_config=False,
                        outputs=[metrics_data, model_data],
                        allow_reuse=True
    )

    evaluate_step = PythonScriptStep(
        name="Evaluate Model",
        script_name='./evaluate/evaluate_model.py',
        #  e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory='../app',
        arguments=[
            "--model_name", model_name_param,
            "--allow_run_cancel", e.allow_run_cancel
        ]
    )

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name='register/register_model2.py', #e.register_script_path,
        compute_target=aml_compute,
        source_directory='../app',
        inputs=[model_data],
        arguments=[
            "--model_name", model_name_param,
            "--model_path", model_data,
            "--ds_name", e.train_dataset_name
        ],
        runconfig=aml_run_config,
        allow_reuse=False
    )

    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        evaluate_step.run_after(trainWithAutomlStep)
        register_step.run_after(evaluate_step)
        pipeline_steps = [ trainWithAutomlStep, evaluate_step, register_step ]
    else:
        print("Exclude the evaluation step and run register step")
        register_step.run_after(trainWithAutomlStep)
        pipeline_steps = [ trainWithAutomlStep, register_step ]

    print( "this is the value for execute pipeline: {}".format(e.execute_pipeline))

    if( (e.execute_pipeline).lower() =='true' ):
        # Execute the pipe normally during testing and debugging
        print("Pipeline submitted for execution.")
        pipeline = Pipeline(workspace = aml_workspace, steps=pipeline_steps)
        pipeline_run = experiment.submit(pipeline)
        pipeline_run.wait_for_completion()
        print("Pipeline is built.")
    else:
        # Generates pipeline that will be called in ML Ops
        train_pipeline = Pipeline(workspace=aml_workspace, steps=pipeline_steps)
        train_pipeline._set_experiment_name
        train_pipeline.validate()
        published_pipeline = train_pipeline.publish(
            name=e.pipeline_name,
            description="Model training/retraining pipeline",
            version=e.build_id
        )
        print(f'Published pipeline: {published_pipeline.name}')
        print(f'for build {published_pipeline.version}')