def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        conda_dependencies_file=e.aml_env_train_conda_dep_file,
        create_new=e.rebuild_env,
    )
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501

    model_name_param = PipelineParameter(
        name="model_name", default_value=e.model_name)  # NOQA: E501
    dataset_version_param = PipelineParameter(name="dataset_version",
                                              default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(name="data_file_path",
                                             default_value="none")
    caller_run_id_param = PipelineParameter(name="caller_run_id",
                                            default_value="none")  # NOQA: E501

    # Get dataset name
    dataset_name = e.dataset_name

    # Check to see if dataset exists
    if dataset_name not in aml_workspace.datasets:
        # Source dir for the data.
        src_dir = "data"

        if not os.path.exists(src_dir):
            raise Exception('Could not find dataset at "%s".'  # NOQA: E501
                            % src_dir)  # NOQA: E501

        # Upload file to default datastore in workspace
        datastore = Datastore.get(aml_workspace, datastore_name)
        target_path = "training-data"
        datastore.upload(
            src_dir=src_dir,
            target_path=target_path,
            overwrite=True,
            show_progress=False,
        )

        # Register dataset
        path_on_datastore = [(datastore, target_path)]
        dataset = Dataset.File.from_files(path=path_on_datastore)
        dataset = dataset.register(
            workspace=aml_workspace,
            name=dataset_name,
            description="img_class training data",
            tags={"format": "gz"},
            create_new_version=True,
        )

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        "pipeline_data", datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_output",
            pipeline_data,
            "--dataset_version",
            dataset_version_param,
            "--data_file_path",
            data_file_path_param,
            "--caller_run_id",
            caller_run_id_param,
            "--dataset_name",
            dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=True,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_input",
            pipeline_data,
        ],  # NOQA: E501
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if (e.run_evaluation).lower() == "true":
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print(f"get_workspace: {aml_workspace}")

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print(f"aml_compute: {aml_compute}")

    # Prepare the dataset input
    data_store = aml_workspace.get_default_datastore()
    print("data_store: %s" % data_store.name)

    # Parameters
    sources_directory_train = e.sources_directory_train
    build_id = e.build_id
    pipeline_name = 'Prepare Data Pipeline'
    train_ds_name = e.dataset_name
    train_data_path = e.datafile_path

    # Register the train dataset
    if (train_ds_name not in aml_workspace.datasets):
        train_path_on_datastore = train_data_path  # +'/*.csv'
        train_ds_data_path = [(data_store, train_path_on_datastore)]
        train_ds = Dataset.File.from_files(path=train_ds_data_path,
                                           validate=False)
        train_ds = train_ds.register(workspace=aml_workspace,
                                     name=train_ds_name,
                                     description='train data',
                                     tags={'format': 'CSV'},
                                     create_new_version=True)
    else:
        train_ds = Dataset.get_by_name(aml_workspace, train_ds_name)

    # Conda environment
    environment = Environment.from_conda_specification(
        "myenv", os.path.join(sources_directory_train,
                              "conda_dependencies.yml"))
    run_config = RunConfiguration()
    run_config.environment = environment

    with open(os.path.join(sources_directory_train,
                           'pipeline_config.json')) as json_file:
        pipe_param = json.load(json_file)
        for param in pipe_param['pipeline_parameter']:
            print(param)

    # Prepare pipeline parameters
    source_blob_url_param = PipelineParameter(name="source_blob_url",
                                              default_value="url")
    data_file_param = PipelineParameter(name="data_file",
                                        default_value="data_file")
    target_column_param = PipelineParameter(name="target_column",
                                            default_value="target_column")
    features_param = PipelineParameter(name="features", default_value="")

    # train_storage_connection_string = "DefaultEndpointsProtocol=https;AccountName=<storage-account>;AccountKey=<redacted>;EndpointSuffix=core.windows.net"  # noqa: E501
    # Copy data step
    copy_step = PythonScriptStep(
        name="Copy Data",
        script_name="copy_data.py",
        arguments=[
            "--source_blob_url", source_blob_url_param,
            "--train_storage_connection_string",
            e.train_storage_connection_string, "--train_storage_container",
            e.train_storage_container, "--data_file", data_file_param,
            "--data_file_path", train_data_path
        ],
        runconfig=run_config,
        compute_target=aml_compute,
        source_directory=sources_directory_train)
    print("Step Copy Data created")

    # Prepare data step
    prepare_step = PythonScriptStep(name="Prepare Data",
                                    script_name="prepare.py",
                                    arguments=[
                                        "--data_file_path", train_data_path,
                                        "--data_file", data_file_param,
                                        "--target_column", target_column_param,
                                        "--features", features_param
                                    ],
                                    runconfig=run_config,
                                    compute_target=aml_compute,
                                    source_directory=sources_directory_train)
    print("Step Prepare created")

    # Publish the pipeline
    prepare_step.run_after(copy_step)
    pipeline_steps = [copy_step, prepare_step]
    pipeline = Pipeline(workspace=aml_workspace, steps=pipeline_steps)
    pipeline._set_experiment_name
    pipeline.validate()
    published_pipeline = pipeline.publish(name=pipeline_name,
                                          description="Prepare Data pipeline",
                                          version=build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')

    # Run the pipelines
    runs = []
    for param in pipe_param['pipeline_parameter']:
        # pipeline_parameters = {"model_name": "nyc_energy_model", "build_id": build_id}
        target_column = param['automl_settings']['label_column_name']
        param.pop('automl_settings')
        param.update({"target_column":
                      target_column})  # Special process target_column
        print(param)
        pipeline_run = published_pipeline.submit(aml_workspace,
                                                 e.experiment_name, param)
        runs.append(pipeline_run)
        print("Pipeline run initiated ", pipeline_run.id)

    # Wait for all runs to finish
    wait(lambda: are_all_runs_finished(runs),
         timeout_seconds=3600,
         sleep_seconds=5,
         waiting_for="all runs are finished")
    print("All prepare data pipeline runs done")
Example 3
#     runconfig=run_config,
#     inputs=[jsonconfigs],
#     # outputs=[jsonconfigs],
#     allow_reuse=False,
# )
# print("Packed the model into a Scoring Image")

# Create Steps dependency such that they run in sequence
evaluate.run_after(train)
register_model.run_after(evaluate)
#package_model.run_after(register_model)

steps = [register_model]

# Build Pipeline
pipeline1 = Pipeline(workspace=ws, steps=steps)
print("Pipeline is built")

# Validate Pipeline
pipeline1.validate()
print("Pipeline validation complete")

# Submit unpublished pipeline with small data set for test
if args.pipeline_action == "pipeline-test":
    pipeline_run1 = Experiment(ws,
                               experiment_name).submit(pipeline1,
                                                       regenerate_outputs=True)
    print("Pipeline is submitted for execution")
    pipeline_run1.wait_for_completion(show_output=True)

# RunDetails(pipeline_run1).show()
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        conda_dependencies_file=e.aml_env_train_conda_dep_file,
        create_new=e.rebuild_env,
    )
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501

    model_name_param = PipelineParameter(
        name="model_name", default_value=e.model_name)  # NOQA: E501
    dataset_version_param = PipelineParameter(name="dataset_version",
                                              default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(name="data_file_path",
                                             default_value="none")
    caller_run_id_param = PipelineParameter(name="caller_run_id",
                                            default_value="none")  # NOQA: E501

    pipeline_parameters = {
        "model_name": model_name_param,
        "dataset_version": dataset_version_param,
        "data_file_path": data_file_path_param,
        "caller_run_id": caller_run_id_param,
    }

    # Get dataset name
    dataset_name = e.dataset_name

    # Check to see if dataset exists
    if dataset_name not in aml_workspace.datasets:
        # This call creates an example CSV from sklearn sample data. If you
        # have already bootstrapped your project, you can comment this line
        # out and use your own CSV.
        create_sample_data_csv()

        # Use a CSV to read in the data set.
        file_name = "diabetes.csv"

        if not os.path.exists(file_name):
            raise Exception(
                'Could not find CSV dataset at "%s". If you have bootstrapped your project, you will need to provide a CSV.'  # NOQA: E501
                % file_name)  # NOQA: E501

        # Upload file to default datastore in workspace
        datastore = Datastore.get(aml_workspace, datastore_name)
        target_path = "training-data/"
        datastore.upload_files(
            files=[file_name],
            target_path=target_path,
            overwrite=True,
            show_progress=False,
        )

        # Register dataset
        path_on_datastore = os.path.join(target_path, file_name)
        dataset = Dataset.Tabular.from_delimited_files(
            path=(datastore, path_on_datastore))
        dataset = dataset.register(
            workspace=aml_workspace,
            name=dataset_name,
            description="diabetes training data",
            tags={"format": "CSV"},
            create_new_version=True,
        )

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        "pipeline_data", datastore=aml_workspace.get_default_datastore())

    # List of pipeline steps
    steps = list()

    train_step = TrainStep(workspace=aml_workspace,
                           env=e,
                           compute=aml_compute,
                           config=run_config,
                           pipeline_parameters=pipeline_parameters,
                           output_pipelinedata=pipeline_data)
    train_step.append_step(steps)

    print("Step Train created")

    if (e.run_evaluation).lower() == "true":
        evaluate_step = EvaluateStep(workspace=aml_workspace,
                                     env=e,
                                     compute=aml_compute,
                                     config=run_config,
                                     pipeline_parameters=pipeline_parameters)
        evaluate_step.append_step(steps)

        print("Include evaluation step before register step.")
        print("Step Evaluate created")
    else:
        print("Exclude evaluation step and directly run register step.")

    register_step = RegisterStep(workspace=aml_workspace,
                                 env=e,
                                 compute=aml_compute,
                                 config=run_config,
                                 pipeline_parameters=pipeline_parameters,
                                 input_pipelinedata=pipeline_data)
    register_step.append_step(steps)

    print("Step Register created")

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
Example 5
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=[
            'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
        ],
        pip_packages=[
            'azure', 'azureml-core', 'azure-storage', 'azure-storage-blob',
            'azureml-dataprep'
        ]))
    run_config.environment.docker.enabled = True
    config_envvar = {}
    if (e.collection_uri is not None and e.teamproject_name is not None):
        builduri_base = e.collection_uri + e.teamproject_name
        builduri_base = builduri_base + "/_build/results?buildId="
        config_envvar["BUILDURI_BASE"] = builduri_base
    run_config.environment.environment_variables = config_envvar

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    build_id_param = PipelineParameter(name="build_id",
                                       default_value=e.build_id)
    hyperparameter_alpha_param = PipelineParameter(name="hyperparameter_alpha",
                                                   default_value=0.5)

    dataset_name = ""
    if (e.datastore_name is not None and e.datafile_name is not None):
        dataset_name = e.dataset_name
        datastore = Datastore.get(aml_workspace, e.datastore_name)
        data_path = [(datastore, e.datafile_name)]
        dataset = Dataset.Tabular.from_delimited_files(path=data_path)
        dataset.register(workspace=aml_workspace,
                         name=e.dataset_name,
                         description="dataset with training data",
                         create_new_version=True)

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
            "--alpha",
            hyperparameter_alpha_param,
            "--dataset_name",
            dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
                              args.storage_account, args.storage_key)

    # get compute
    compute = get_compute(ws, args.compute_target)

    # prep step
    pdata, pstep = process_step(datastore, compute, args.datastore_path)

    # train step
    tdata, tstep = train_step(datastore, pdata, compute)

    # register step (tag model with version)
    rdata, rstep = register_step(datastore, tdata, compute,
                                 args.universal_package)

    # create pipeline from steps
    seer_pipeline = Pipeline(workspace=ws, steps=[pstep, tstep, rstep])
    published_pipeline = seer_pipeline.publish(
        name="Seer Pipeline",
        description="Transfer learned image classifier. Uses folders as labels."
    )

    # add pipeline to endpoint
    endpoint = add_endpoint(ws, published_pipeline, 'seer-endpoint')

    # run pipeline
    pipeline_run = endpoint.submit('seer')
    pipeline_run.set_tags(
        tags={'universalPackageVersion': args.universal_package})
    print(f'Run created with ID: {pipeline_run.id}')
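# add_endpoint is not defined in this fragment. A minimal sketch, assuming it
# reuses an existing PipelineEndpoint when present and otherwise publishes one:
from azureml.pipeline.core import PipelineEndpoint


def add_endpoint(workspace, published_pipeline, endpoint_name):
    try:
        endpoint = PipelineEndpoint.get(workspace=workspace,
                                        name=endpoint_name)
        endpoint.add_default(published_pipeline)
    except Exception:
        endpoint = PipelineEndpoint.publish(workspace=workspace,
                                            name=endpoint_name,
                                            pipeline=published_pipeline,
                                            description=endpoint_name)
    return endpoint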
Example 7
    environment=batch_env,
    compute_target=compute_target,
    process_count_per_node=PipelineParameter(name="process_count_param",
                                             default_value=2),
    node_count=2)
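# The lines above are the tail of a ParallelRunConfig definition whose start was
# cut off. A hedged sketch of the complete construct it appears to belong to;
# values marked "assumed" are illustrative, not from the original.
from azureml.pipeline.core import PipelineParameter
from azureml.pipeline.steps import ParallelRunConfig

parallel_run_config = ParallelRunConfig(
    source_directory=".",                # assumed
    entry_script="batch_scoring.py",     # assumed
    mini_batch_size="5",                 # assumed
    error_threshold=10,                  # assumed
    output_action="append_row",
    environment=batch_env,
    compute_target=compute_target,
    process_count_per_node=PipelineParameter(name="process_count_param",
                                             default_value=2),
    node_count=2)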

from azureml.pipeline.steps import ParallelRunStep
from datetime import datetime

parallel_step_name = "batchscoring-" + datetime.now().strftime("%Y%m%d%H%M")

parallelrun_step = ParallelRunStep(name=parallel_step_name,
                                   parallel_run_config=parallel_run_config,
                                   inputs=[input_mnist_ds_consumption],
                                   output=output_dir,
                                   allow_reuse=True)

from azureml.pipeline.core import Pipeline
from azureml.core.experiment import Experiment

pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])
experiment = Experiment(ws, 'batch_scoring')
pipeline_run = experiment.submit(pipeline)

#from azureml.widgets import RunDetails
#RunDetails(pipeline_run).show()

pipeline_run.wait_for_completion(show_output=True)
Example 8
from azureml.pipeline.steps import DatabricksStep
from azureml.core.databricks import PyPiLibrary

notebook_path="/Users/[email protected]/ModelTraining" # Databricks notebook path

dbNbStep = DatabricksStep(
    name="DBNotebookInWS",
    inputs=[datasetFilePath],
    outputs=[output],
    num_workers=1,
    notebook_path=notebook_path,
    run_name='DB_Notebook_demo',
    compute_target=databricks_compute,
    allow_reuse=True,
    spark_version="7.2.x-scala2.12",
    pypi_libraries=[PyPiLibrary(package='scikit-learn'),
                    PyPiLibrary(package='azureml-sdk'),
                    PyPiLibrary(package='lightgbm'),
                    PyPiLibrary(package='pandas')],
    node_type="Standard_D13_v2"
)

steps = [dbNbStep]
pipeline = Pipeline(workspace=ws, steps=steps)
pipeline_run = Experiment(ws, 'DB_Notebook_demo').submit(pipeline)
pipeline_run.wait_for_completion()

# COMMAND ----------

from azureml.widgets import RunDetails
RunDetails(pipeline_run).show()
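# databricks_compute above is assumed to be a Databricks compute target that is
# already attached to the workspace. A hedged sketch of attaching one; the
# resource group, workspace name and access token are placeholders.
from azureml.core.compute import ComputeTarget, DatabricksCompute
from azureml.exceptions import ComputeTargetException

db_compute_name = "adb-compute"  # assumed name
try:
    databricks_compute = ComputeTarget(workspace=ws, name=db_compute_name)
except ComputeTargetException:
    attach_config = DatabricksCompute.attach_configuration(
        resource_group="<databricks-resource-group>",
        workspace_name="<databricks-workspace-name>",
        access_token="<databricks-access-token>")
    databricks_compute = ComputeTarget.attach(ws, db_compute_name,
                                              attach_config)
    databricks_compute.wait_for_completion(show_output=True)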
Example 9
def main():
    # Replace this with the Use of Service Principal for authenticating with the Workspace
    ws = Workspace.from_config()

    # Choose a name for your CPU cluster
    cpu_cluster_name = "cpu-cluster"

    # Verify that cluster does not exist already
    try:
        cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
        print('Found existing cluster, use it.')
    except ComputeTargetException:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_D2_V2', max_nodes=4)
        cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name,
                                           compute_config)

    cpu_cluster.wait_for_completion(show_output=True)

    # Run configuration for R
    rc = RunConfiguration()
    rc.framework = "R"

    # Run configuration for python
    py_rc = RunConfiguration()
    py_rc.framework = "Python"
    py_rc.environment.docker.enabled = True

    # Combine GitHub and Cran packages for R env
    rc.environment.r = RSection()
    rc.environment.docker.enabled = True

    # Upload iris data to the datastore
    # target_path = "iris_data"
    # upload_files_to_datastore(ds,
    #                         list("./iris.csv"),
    #                         target_path = target_path,
    #                         overwrite = TRUE)

    training_data = DataReference(
        datastore=ws.get_default_datastore(),
        data_reference_name="iris_data",
        path_on_datastore="iris_data/iris.csv",
    )

    print('Successful')
    print(training_data)

    # PipelineData object for newly trained model
    # trained_model_dir = PipelineData(
    #     name="trained_model", datastore=ws.get_default_datastore(), is_directory=True
    # )

    # Train and Register the model
    train_step = RScriptStep(
        script_name="train.R",
        arguments=[training_data],
        inputs=[training_data],
        compute_target=cpu_cluster_name,
        source_directory=".",
        runconfig=rc,
        allow_reuse=True,
    )

    # Deploy the trained model

    print("Step Train created")

    steps = [train_step]

    train_pipeline = Pipeline(workspace=ws, steps=steps)
    train_pipeline.validate()
    pipeline_run = Experiment(ws, 'iris_training').submit(train_pipeline)
    pipeline_run.wait_for_completion(show_output=True)

    published_pipeline = train_pipeline.publish(
        name="iris-train",
        description="Model training/retraining pipeline",
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
Example 10
    print('Error while retrieving compute', e)
    sys.exit(-1)


################################
# If you want to use datastore
################################
# from azureml.core import Datastore
# from azureml.data.data_reference import DataReference
# from azureml.pipeline.core import PipelineData

# def_blob_store = Datastore(ws, "workspaceblobstore")

# input_data = DataReference(
#     datastore=def_blob_store,
#     data_reference_name="input_data",
#     path_on_datastore="20newsgroups/20news.pkl")

# output = PipelineData("output", datastore=def_blob_store)


est_step = EstimatorStep(name="Estimator_Train", 
                         estimator=est, 
                         estimator_entry_script_arguments=["--datadir", input_data, "--output", output],
                         runconfig_pipeline_params=None, 
                         inputs=[input_data], 
                         outputs=[output], 
                         compute_target=compute_target)

pipeline = Pipeline(workspace=ws, steps=[est_step])
pipeline_run = experiment.submit(pipeline)
Example 11
def main():
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = "AML-RG-" + os.environ.get("BASE_NAME")
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    aks_name = os.environ.get("AKS_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")
    experiment_name = os.environ.get("EXPERIMENT_NAME")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(workspace_name, resource_group,
                                  subscription_id, tenant_id, app_id,
                                  app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, compute_name, vm_size)
    if aml_compute is not None:
        print(aml_compute)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=['numpy', 'pandas', 'scikit-learn', 'keras'],
        pip_packages=[
            'azure', 'azureml-sdk', 'azure-storage', 'azure-storage-blob',
            'transformers>=2.1.1', 'tensorflow>=2.0.0', 'tensorflow-gpu>=2.0.0'
        ]))
    run_config.environment.docker.enabled = True

    datastore_name = 'tfworld'
    container_name = 'azureml-blobstore-7c6bdd88-21fa-453a-9c80-16998f02935f'
    account_name = 'tfworld6818510241'
    sas_token = '?sv=2019-02-02&ss=bfqt&srt=sco&sp=rl&se=2019-11-08T05:12:15Z&st=2019-10-23T20:12:15Z&spr=https&sig=eDqnc51TkqiIklpQfloT5vcU70pgzDuKb5PAGTvCdx4%3D'  # noqa: E501

    try:
        existing_datastore = Datastore.get(aml_workspace, datastore_name)
    except:  # noqa: E722
        existing_datastore = Datastore \
            .register_azure_blob_container(workspace=aml_workspace,
                                           datastore_name=datastore_name,
                                           container_name=container_name,
                                           account_name=account_name,
                                           sas_token=sas_token
                                           )

    azure_dataset = Dataset.File.from_files(
        path=(existing_datastore, 'azure-service-classifier/data'))
    azure_dataset = azure_dataset.register(
        workspace=aml_workspace,
        name='Azure Services Dataset',
        description='Dataset containing azure related posts on Stackoverflow',
        create_new_version=True)

    azure_dataset.to_path()
    input_data = azure_dataset.as_named_input('input_data1').as_mount(
        '/tmp/data')

    model_name = PipelineParameter(name="model_name", default_value=model_name)
    max_seq_length = PipelineParameter(name="max_seq_length",
                                       default_value=128)
    learning_rate = PipelineParameter(name="learning_rate", default_value=3e-5)
    num_epochs = PipelineParameter(name="num_epochs", default_value=3)
    export_dir = PipelineParameter(name="export_dir",
                                   default_value="./outputs/exports")
    batch_size = PipelineParameter(name="batch_size", default_value=32)
    steps_per_epoch = PipelineParameter(name="steps_per_epoch",
                                        default_value=100)

    # initialize the TensorFlow estimator
    estimator = TensorFlow(source_directory=sources_directory_train,
                           entry_script=train_script_path,
                           compute_target=aml_compute,
                           framework_version='2.0',
                           use_gpu=True,
                           pip_packages=[
                               'transformers==2.0.0',
                               'azureml-dataprep[fuse,pandas]==1.1.22'
                           ])

    train_step = EstimatorStep(
        name="Train Model",
        estimator=estimator,
        estimator_entry_script_arguments=[
            "--data_dir", input_data, "--max_seq_length", max_seq_length,
            "--learning_rate", learning_rate, "--num_epochs", num_epochs,
            "--export_dir", export_dir, "--batch_size", batch_size,
            "--steps_per_epoch", steps_per_epoch
        ],
        compute_target=aml_compute,
        inputs=[input_data],
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--model_name",
            model_name,
            "--build_id",
            build_id,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    # Currently, the Evaluate step will automatically register
    # the model if it performs better. This step is based on a
    # previous version of the repo which utilized JSON files to
    # track evaluation results.

    evaluate_step.run_after(train_step)
    steps = [evaluate_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline",
        version=build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')

    response = published_pipeline.submit(  # noqa: F841
        workspace=aml_workspace,
        experiment_name=experiment_name)

    # Get AKS cluster for deployment
    aks_compute = get_aks(aml_workspace, aks_name)
    if aks_compute is not None:
        print(aks_compute)
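# get_aks is not defined in this snippet. A minimal sketch, assuming it looks up
# an existing AKS inference cluster by name and returns None when it is absent:
from azureml.core.compute import AksCompute, ComputeTarget
from azureml.exceptions import ComputeTargetException


def get_aks(workspace, aks_name):
    try:
        aks_target = ComputeTarget(workspace=workspace, name=aks_name)
        if isinstance(aks_target, AksCompute):
            return aks_target
        print(aks_name + " exists but is not an AKS cluster")
    except ComputeTargetException:
        print("AKS cluster " + aks_name + " not found")
    return None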
Example 12
step1 = PythonScriptStep(name = 'prepare data',
                         source_directory = 'scripts',
                         script_name = 'data_prep.py',
                         compute_target = 'aml-cluster')

# Step to train a model
step2 = PythonScriptStep(name = 'train model',
                         source_directory = 'scripts',
                         script_name = 'train_model.py',
                         compute_target = 'aml-cluster')

from azureml.pipeline.core import Pipeline
from azureml.core import Experiment

# Construct the pipeline
train_pipeline = Pipeline(workspace = ws, steps = [step1,step2])

# Create an experiment and run the pipeline
experiment = Experiment(workspace = ws, name = 'training-pipeline')
pipeline_run = experiment.submit(train_pipeline)


#Pass Data bn Pipeline steps w PipelineData obj
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep

# Get a dataset for the initial data
raw_ds = Dataset.get_by_name(ws, 'raw_dataset')

# Define a PipelineData object to pass data between steps
data_store = ws.get_default_datastore()
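
# The snippet ends before the PipelineData object is actually created. A hedged
# continuation of the pattern it describes; the argument names passed to the
# scripts are assumptions.
prepped_data = PipelineData("prepped_data", datastore=data_store)

# Step that produces the intermediate data
step1 = PythonScriptStep(name="prepare data",
                         source_directory="scripts",
                         script_name="data_prep.py",
                         compute_target="aml-cluster",
                         arguments=["--raw_ds",
                                    raw_ds.as_named_input("raw_data"),
                                    "--out_folder", prepped_data],
                         outputs=[prepped_data])

# Step that consumes the intermediate data produced by step1
step2 = PythonScriptStep(name="train model",
                         source_directory="scripts",
                         script_name="train_model.py",
                         compute_target="aml-cluster",
                         arguments=["--in_folder", prepped_data],
                         inputs=[prepped_data])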
Example 13
        "owner='Neelabh Kashyap'",
        "--tag",
        "team='Data Science'",
        "--tag",
        "comment=prod",
    ],
    inputs=[model_outpath],
    compute_target=compute_target,
    source_directory=".",
    runconfig=py_rc,
    allow_reuse=False,
)

pipeline = Pipeline(
    workspace=ws,
    steps=[download_step, train_step, register_model],
    description="Builds R model for iris dataset",
)

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("--publish", action="store_true")

    args = parser.parse_args()

    if args.publish:
        p = pipeline.publish(
            name="iris-classifier-train-r",
            description="train a classifer on iris dataset and register model",
        )
def build_prednet_pipeline(dataset, ws):
    print("building pipeline for dataset %s in workspace %s" %
          (dataset, ws.name))

    base_dir = "."

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = "./scripts"
    os.makedirs(script_folder, exist_ok=True)

    shutil.copytree(os.path.join(base_dir, "models"),
                    os.path.join(base_dir, script_folder, "models"))
    shutil.copy(os.path.join(base_dir, "train.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "data_preparation.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "register_prednet.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "batch_scoring.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "train_clf.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "register_clf.py"), script_folder)

    cpu_compute_name = args.cpu_compute_name
    cpu_compute_target = AmlCompute(ws, cpu_compute_name)
    print("found existing compute target: %s" % cpu_compute_name)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = args.gpu_compute_name

    gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
    print(gpu_compute_target.get_status().serialize())

    env = Environment.get(ws, "prednet")

    # Runconfigs
    runconfig = RunConfiguration()
    runconfig.environment = env
    print("PipelineData object created")

    # DataReference to where raw data is stored.
    raw_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="raw_data",
        path_on_datastore=os.path.join("prednet", "data", "raw_data"),
    )
    print("DataReference object created")

    # Naming the intermediate data as processed_data and assigning it to the
    # variable processed_data.
    preprocessed_data = PipelineData("preprocessed_data",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    hd_child_cwd = PipelineData("prednet_model_path", datastore=def_blob_store)
    # prednet_path = PipelineData("outputs", datastore=def_blob_store)
    scored_data = PipelineData("scored_data", datastore=def_blob_store)
    model_path = PipelineData("model_path", datastore=def_blob_store)

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name="prepare_data",
        script_name="data_preparation.py",
        arguments=[
            "--raw_data",
            raw_data,
            "--preprocessed_data",
            preprocessed_data,
            "--dataset",
            dataset,
        ],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    # data_prep.run_after(video_decoding)

    print("data_prep step created")

    est = Estimator(
        source_directory=script_folder,
        compute_target=gpu_compute_target,
        entry_script="train.py",
        node_count=1,
        environment_definition=env,
    )

    ps = BayesianParameterSampling({
        "--batch_size": choice(1, 2, 4, 10),
        "--filter_sizes": choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        "--stack_sizes": choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),
        "--learning_rate": uniform(1e-6, 1e-3),
        "--lr_decay": uniform(1e-9, 1e-2),
        "--freeze_layers": choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2",
                                  "2, 3", "0", "3"),
        # "--fine_tuning": choice("True", "False"),
    })

    hdc = HyperDriveConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        primary_metric_name="val_loss",
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=3,
        max_concurrent_runs=3,
        max_duration_minutes=60 * 6,
    )

    train_prednet = HyperDriveStep(
        "train_w_hyperdrive",
        hdc,
        estimator_entry_script_arguments=[
            "--preprocessed_data",
            preprocessed_data,
            "--remote_execution",
            "--dataset",
            dataset,
        ],
        inputs=[preprocessed_data],
        outputs=[hd_child_cwd],
        metrics_output=data_metrics,
        allow_reuse=True,
    )
    train_prednet.run_after(data_prep)

    register_prednet = PythonScriptStep(
        name="register_prednet",
        script_name="register_prednet.py",
        arguments=[
            "--data_metrics",
            data_metrics,
        ],
        compute_target=cpu_compute_target,
        inputs=[data_metrics, hd_child_cwd],
        source_directory=script_folder,
        allow_reuse=True,
    )
    register_prednet.run_after(train_prednet)

    batch_scoring = PythonScriptStep(
        name="batch_scoring",
        script_name="batch_scoring.py",
        arguments=[
            "--preprocessed_data",
            preprocessed_data,
            "--scored_data",
            scored_data,
            "--dataset",
            dataset,
            # "--prednet_path",
            # prednet_path
        ],
        compute_target=gpu_compute_target,
        inputs=[preprocessed_data],
        outputs=[scored_data],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    batch_scoring.run_after(register_prednet)

    train_clf = PythonScriptStep(
        name="train_clf",
        script_name="train_clf.py",
        arguments=[
            "--preprocessed_data", preprocessed_data, "--scored_data",
            scored_data, "--model_path", model_path
        ],
        compute_target=cpu_compute_target,
        inputs=[preprocessed_data, scored_data],
        outputs=[model_path],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    train_clf.run_after(batch_scoring)

    register_clf = PythonScriptStep(
        name="register_clf",
        script_name="register_clf.py",
        arguments=["--model_path", model_path],
        inputs=[model_path],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        allow_reuse=True,
        runconfig=runconfig,
    )
    register_clf.run_after(train_clf)

    pipeline = Pipeline(
        workspace=ws,
        steps=[
            data_prep,
            train_prednet,
            register_prednet,
            batch_scoring,
            train_clf,
            register_clf,
        ],
    )
    pipeline.validate()

    pipeline_name = "prednet_" + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    _ = Schedule.create(
        workspace=ws,
        name=pipeline_name + "_sch",
        pipeline_id=published_pipeline.id,
        experiment_name=pipeline_name,
        datastore=def_blob_store,
        wait_for_provisioning=True,
        description="Datastore scheduler for Pipeline" + pipeline_name,
        path_on_datastore=os.path.join("prednet/data/raw_data", dataset,
                                       "Train"),
        polling_interval=60 * 24,
    )

    published_pipeline.submit(ws, pipeline_name)
Example 15
    step3 = PythonScriptStep(name="register model",
                             compute_target=env.aml_compute_name,
                             source_directory='src/steps',
                             script_name='03_reg_model.py',
                             inputs=[model_dir],
                             outputs=[],
                             arguments=[
                                 '--model_dir', model_dir, '--model_name',
                                 env.aml_model_name
                             ],
                             runconfig=run_config,
                             allow_reuse=False)

    # Build pipeline
    pipeline = Pipeline(workspace=ws, steps=[step3])
    pipeline.validate()

    # Publish pipeline & pipeline_endpoint
    published_pipeline = pipeline.publish(name=env.aml_pipeline_name)

    try:
        pipeline_endpoint = PipelineEndpoint.get(
            workspace=ws, name=env.aml_pipeline_endpoint_name)
        pipeline_endpoint.add_default(published_pipeline)
    except ErrorResponseException:
        pipeline_endpoint = PipelineEndpoint.publish(
            workspace=ws,
            name=env.aml_pipeline_endpoint_name,
            description=env.aml_pipeline_endpoint_name,
            pipeline=published_pipeline)
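
    # A brief usage sketch (assumed, not part of the original fragment): submit
    # a run against the endpoint's default version once it exists.
    endpoint_run = pipeline_endpoint.submit("training-experiment")  # assumed experiment name
    print("Submitted pipeline run:", endpoint_run.id)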
def main():
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = os.environ.get("BASE_NAME") + "-AML-RG"
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(workspace_name, resource_group,
                                  subscription_id, tenant_id, app_id,
                                  app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, compute_name, vm_size)
    if aml_compute is not None:
        print(aml_compute)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=[
            'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
        ],
        pip_packages=[
            'azure', 'azureml-core', 'azure-storage', 'azure-storage-blob',
            'azure-cli', 'azure-cli-telemetry', 'azure-cli-core',
            'azure-cli-nspkg', 'azure-cli-command-modules-nspkg'
        ]))
    run_config.environment.docker.enabled = True

    model_name = PipelineParameter(name="model_name", default_value=model_name)
    release_id = PipelineParameter(name="release_id", default_value="0")
    storageacctname = PipelineParameter(
        name="storageacctname",
        default_value=os.environ.get("STORAGE_ACCT_NAME"))
    storageacctkey = PipelineParameter(
        name="storageacctkey",
        default_value=os.environ.get("STORAGE_ACCT_KEY"))
    containername = PipelineParameter(
        name="containername",
        default_value=os.environ.get("STORAGE_BLOB_NAME"))

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=train_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--release_id", release_id, "--model_name", model_name,
            "--storageacctname", storageacctname, "--storageacctkey",
            storageacctkey, "--containername", containername
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    steps = [train_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline",
        version=build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Example 17
# environment.register(ws)
environment = Environment.get(ws, "sentiment-env")

estimator = TensorFlow(
    source_directory="imdb",
    entry_script="experiment.py",
    framework_version="2.1",
    conda_packages=["python=3.7.4", "tensorflow", "tensorflow-datasets"],
    pip_packages=["azureml-sdk[notebooks,automl,explain]"],
    compute_target="archi-trainer")

model_step = EstimatorStep(
    name="training model",
    estimator=estimator,
    compute_target="archi-trainer",
    estimator_entry_script_arguments=['--n-words', 5000, '--epochs', 2])
# register_step = PythonScriptStep(name="register pipeline", source_directory="sentiment_analysis", script_name="registration.py", compute_target="dummy", runconfig=run_config)
# register_step.run_after(model_step)

sentiment_pipe = Pipeline(workspace=ws, steps=[model_step])
sentiment_pipe.validate()

experiment = Experiment(workspace=ws, name="sentiment-analysis")
run = experiment.submit(config=sentiment_pipe)

run.wait_for_completion(show_output=True)

ds.upload('outputs/sentiment_model.h5',
          'models',
          overwrite=True,
          show_progress=True)
Example 18
    def __init__(self, request_id, use_url, input_container_sas,
                 internal_datastore, model_name):
        try:
            self.request_id = request_id

            aml_config = api_config.AML_CONFIG

            self.ws = Workspace(subscription_id=aml_config['subscription_id'],
                                resource_group=aml_config['resource_group'],
                                workspace_name=aml_config['workspace_name'],
                                auth=svc_pr)
            print('AMLCompute constructor, AML workspace obtained.')

            internal_dir, output_dir = self._get_data_references(
                request_id, internal_datastore)

            compute_target = self.ws.compute_targets[
                aml_config['aml_compute_name']]

            dependencies = CondaDependencies.create(pip_packages=[
                'tensorflow-gpu==1.12.0', 'pillow', 'numpy',
                'azure-storage-blob==2.1.0', 'azureml-defaults==1.0.41'
            ])

            amlcompute_run_config = RunConfiguration(
                conda_dependencies=dependencies)
            amlcompute_run_config.environment.docker.enabled = True
            amlcompute_run_config.environment.docker.gpu_support = True
            amlcompute_run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE
            amlcompute_run_config.environment.spark.precache_packages = False

            # default values are required and need to be literal values or data references as JSON
            param_job_id = PipelineParameter(name='param_job_id',
                                             default_value='default_job_id')

            param_begin_index = PipelineParameter(name='param_begin_index',
                                                  default_value=0)
            param_end_index = PipelineParameter(name='param_end_index',
                                                default_value=0)

            param_detection_threshold = PipelineParameter(
                name='param_detection_threshold', default_value=0.05)

            batch_score_step = PythonScriptStep(
                aml_config['script_name'],
                source_directory=aml_config['source_dir'],
                hash_paths=['.'],  # include all contents of source_directory
                name='batch_scoring',
                arguments=[
                    '--job_id',
                    param_job_id,
                    '--request_id',
                    request_id,
                    '--model_name',
                    model_name,
                    '--input_container_sas',
                    input_container_sas,  # can be None
                    '--use_url',
                    use_url,
                    '--internal_dir',
                    internal_dir,
                    '--begin_index',
                    param_begin_index,  # inclusive
                    '--end_index',
                    param_end_index,  # exclusive
                    '--output_dir',
                    output_dir,
                    '--detection_threshold',
                    param_detection_threshold
                ],
                compute_target=compute_target,
                inputs=[internal_dir],
                outputs=[output_dir],
                runconfig=amlcompute_run_config)
            self.pipeline = Pipeline(workspace=self.ws,
                                     steps=[batch_score_step])
            self.aml_config = aml_config
            print('AMLCompute constructor all good.')
        except Exception as e:
            raise RuntimeError(
                'Error in setting up AML Compute resource: {}.'.format(str(e)))
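
    # A hedged sketch of how a class like this might kick off the pipeline it
    # builds; the method name, experiment handling and parameter dict are
    # assumptions, not part of the original.
    def submit_pipeline(self, experiment_name, pipeline_parameters=None):
        from azureml.core import Experiment  # local import keeps the sketch self-contained
        experiment = Experiment(self.ws, experiment_name)
        run = experiment.submit(self.pipeline,
                                pipeline_parameters=pipeline_parameters)
        print('Submitted pipeline run {} for request {}.'.format(
            run.id, self.request_id))
        return run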
Example 19
                                          "--input",
                                          ds_input.as_download(), "--output",
                                          prepared_data
                                      ],
                                      inputs=[ds_input],
                                      outputs=[prepared_data],
                                      compute_target=compute_target,
                                      runconfig=aml_run_config,
                                      allow_reuse=True)

    # Step 2: Train our model
    training_results = PipelineData(name="training_results",
                                    datastore=blob_store)

    train_step = PythonScriptStep(script_name=train_entry_point,
                                  source_directory=train_source_dir,
                                  arguments=[
                                      "--prepped_data",
                                      prepared_data.as_input(),
                                      "--training_results", training_results
                                  ],
                                  compute_target=compute_target,
                                  runconfig=aml_run_config,
                                  allow_reuse=True)

    # Step 3: Build the pipeline
    pipeline_steps = [data_prep_step, train_step]
    pipeline = Pipeline(workspace=workspace, steps=pipeline_steps)

    pipeline_run1 = Experiment(workspace, "mckee_bot").submit(pipeline)
    pipeline_run1.wait_for_completion()
Example 20
est = Estimator(source_directory='.',
                compute_target=ws.compute_targets['cpu'],
                entry_script='azureml-issues.py',
                pip_packages=['azure-devops', 'pandas'])

data_processing = EstimatorStep(estimator=est,
                                estimator_entry_script_arguments=[
                                    '--data_path', blob_output_data,
                                    '--analyze', '--load_open', '--load_closed'
                                ],
                                inputs=[blob_output_data],
                                compute_target=ws.compute_targets['cpu'],
                                allow_reuse=False)

pipeline = Pipeline(workspace=ws, steps=[data_processing])
print("Pipeline is built")

pipeline.validate()
print("Simple validation complete")

pipeline_run = Experiment(ws, 'issues_pipeline').submit(pipeline)
print("Pipeline is submitted for execution")

published_pipeline = pipeline.publish(
    name="Issues_Stats",
    description="Pull data from DevOps and aggregate for PowerBI")
print(published_pipeline.id)

from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule
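# The schedule import above is left unused in this fragment. A hedged sketch of
# putting the published pipeline on a recurring schedule; the frequency and
# names are assumptions.
recurrence = ScheduleRecurrence(frequency="Day", interval=1)
schedule = Schedule.create(ws,
                           name="issues-stats-daily",
                           pipeline_id=published_pipeline.id,
                           experiment_name="issues_pipeline",
                           recurrence=recurrence)
print("Schedule created:", schedule.id)
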
def main():
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = os.environ.get("RESOURCE_GROUP")
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    register_script_path = os.environ.get("REGISTER_SCRIPT_PATH")
    vm_size_cpu = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
    compute_name_cpu = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(workspace_name, resource_group,
                                  subscription_id, tenant_id, app_id,
                                  app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute_cpu = get_compute(aml_workspace, compute_name_cpu, vm_size_cpu)
    if aml_compute_cpu is not None:
        print(aml_compute_cpu)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=[
            'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
        ],
        pip_packages=[
            'azure', 'azureml-core', 'azure-storage', 'azure-storage-blob'
        ]))
    run_config.environment.docker.enabled = True

    model_name = PipelineParameter(name="model_name", default_value=model_name)
    def_blob_store = Datastore(aml_workspace, "workspaceblobstore")
    jsonconfigs = PipelineData("jsonconfigs", datastore=def_blob_store)
    config_suffix = datetime.datetime.now().strftime("%Y%m%d%H")

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=train_script_path,
        compute_target=aml_compute_cpu,
        source_directory=sources_directory_train,
        arguments=[
            "--config_suffix",
            config_suffix,
            "--json_config",
            jsonconfigs,
            "--model_name",
            model_name,
        ],
        runconfig=run_config,
        # inputs=[jsonconfigs],
        outputs=[jsonconfigs],
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        compute_target=aml_compute_cpu,
        source_directory=sources_directory_train,
        arguments=[
            "--config_suffix",
            config_suffix,
            "--json_config",
            jsonconfigs,
        ],
        runconfig=run_config,
        inputs=[jsonconfigs],
        # outputs=[jsonconfigs],
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_model_step = PythonScriptStep(
        name="Register New Trained Model",
        script_name=register_script_path,
        compute_target=aml_compute_cpu,
        source_directory=sources_directory_train,
        arguments=[
            "--config_suffix",
            config_suffix,
            "--json_config",
            jsonconfigs,
            "--model_name",
            model_name,
        ],
        runconfig=run_config,
        inputs=[jsonconfigs],
        # outputs=[jsonconfigs],
        allow_reuse=False,
    )
    print("Step register model created")

    evaluate_step.run_after(train_step)
    register_model_step.run_after(evaluate_step)
    steps = [register_model_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name="training-pipeline",
        description="Model training/retraining pipeline")

    train_pipeline_json = {}
    train_pipeline_json["rest_endpoint"] = published_pipeline.endpoint
    json_file_path = "ml_service/pipelines/train_pipeline.json"
    with open(json_file_path, "w") as outfile:
        json.dump(train_pipeline_json, outfile)
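# A minimal sketch of how the REST endpoint saved above might later be used to
# trigger the published pipeline; the experiment name and parameter value are
# placeholders, and the service-principal env vars are the same ones read in main():
import json
import os
import requests
from azureml.core.authentication import ServicePrincipalAuthentication

with open("ml_service/pipelines/train_pipeline.json") as f:
    rest_endpoint = json.load(f)["rest_endpoint"]

sp_auth = ServicePrincipalAuthentication(
    tenant_id=os.environ["TENANT_ID"],
    service_principal_id=os.environ["SP_APP_ID"],
    service_principal_password=os.environ["SP_APP_SECRET"])
response = requests.post(rest_endpoint,
                         headers=sp_auth.get_authentication_header(),
                         json={"ExperimentName": "training-pipeline-run",
                               "ParameterAssignments": {"model_name": "my-model"}})
response.raise_for_status()
print("Submitted pipeline run:", response.json().get("Id"))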
Example #22
    name='Train Estimator Step',
    estimator=trainEstimator,
    inputs=[training_data_location],
    outputs=[model],
    compute_target=computeCluster,
    estimator_entry_script_arguments = script_params
)

# == Step 3 ==
model_name = "MargeOrHomer"

registerStep = PythonScriptStep(name="Register model for deployment",
                            script_name="register.py",
                            compute_target=computeCluster,
                            inputs=[model],
                            arguments=['--dataset_name', model_name,
                                       '--model_assets_path', model
                                      ],
                            source_directory=script_folder)

# Create the pipeline
prep_train_register = [preProcessDataStep,trainOnGpuStep,registerStep]
pipeline = Pipeline(workspace=ws, steps=prep_train_register)
pipeline.validate()

# Publish the pipeline
mlpipeline = pipeline.publish(name="Marge Or Homer - Training pipeline")
print("Pipeline Published ID:"+mlpipeline.id)

# Submit the pipeline to be run
mlpipeline.submit(ws,"Marge-or-Homer" ,pipeline_parameters={"source_dataset":DataPath(datastore=source_ds, path_on_datastore="trainingdata")})
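# A minimal sketch of what register.py might do with the arguments wired up
# above; the registration call and asset layout are assumptions:
import argparse
from azureml.core import Run
from azureml.core.model import Model

parser = argparse.ArgumentParser()
parser.add_argument("--dataset_name", type=str)
parser.add_argument("--model_assets_path", type=str)
args = parser.parse_args()

run = Run.get_context()
# register the folder produced by the training step as a named model
Model.register(workspace=run.experiment.workspace,
               model_path=args.model_assets_path,
               model_name=args.dataset_name)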
Example #23
                                arguments = ["--input_directory", anomaly_data],
                                inputs = [anomaly_data],
                                # outputs = [model],
                                compute_target = aml_compute, 
                                source_directory = project_folder,
                                allow_reuse = True,
                                runconfig = amlcompute_run_config)

print("AutoML Training Step created.")

############################### set up, validate and run pipeline

steps = [anom_detect, automl_step]
print("Step lists created")

pipeline = Pipeline(workspace = ws, steps = steps)
print ("Pipeline is built")

pipeline.validate()
print("Pipeline validation complete")

pipeline_run = experiment.submit(pipeline) #, regenerate_outputs=True)
print("Pipeline is submitted for execution")

# Wait until the run finishes.
pipeline_run.wait_for_completion(show_output = False)
print("Pipeline run completed")

############################### upload artifacts to AML Workspace

# Download aml_config info and output of automl_step
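# The snippet ends here; a minimal sketch of such a download, using only generic
# Run APIs (the file-name prefixes and local paths are illustrative):
import os

for step_run in pipeline_run.get_steps():
    for file_name in step_run.get_file_names():
        if file_name.startswith("outputs/") or file_name.startswith("aml_config/"):
            local_path = os.path.join("artifacts", step_run.id,
                                      os.path.basename(file_name))
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            step_run.download_file(file_name, output_file_path=local_path)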
def get_scoring_pipeline(
    scoring_dataset: Dataset,
    output_loc: PipelineData,
    score_run_config: ParallelRunConfig,
    copy_run_config: RunConfiguration,
    computetarget: ComputeTarget,
    ws: Workspace,
    env: Env,
) -> Pipeline:
    """
    Creates the scoring pipeline.

    :param scoring_dataset: Data to score
    :param output_loc: Location to save the scoring results
    :param score_run_config: Parallel Run configuration to support
    parallelized scoring
    :param copy_run_config: Script Run configuration to support
    score copying
    :param computetarget: AML Compute target
    :param ws: AML Workspace
    :param env: Environment Variables

    :returns: Scoring pipeline instance
    """
    # To help filter the model make the model name, model version and a
    # tag/value pair bindable parameters so that they can be passed to
    # the pipeline when invoked either over REST or via the AML SDK.
    model_name_param = PipelineParameter("model_name",
                                         default_value=" ")  # NOQA: E501
    model_version_param = PipelineParameter("model_version",
                                            default_value=" ")  # NOQA: E501
    model_tag_name_param = PipelineParameter("model_tag_name",
                                             default_value=" ")  # NOQA: E501
    model_tag_value_param = PipelineParameter("model_tag_value",
                                              default_value=" ")  # NOQA: E501

    scoring_step = ParallelRunStep(
        name="scoringstep",
        inputs=[scoring_dataset],
        output=output_loc,
        arguments=[
            "--model_name",
            model_name_param,
            "--model_version",
            model_version_param,
            "--model_tag_name",
            model_tag_name_param,
            "--model_tag_value",
            model_tag_value_param,
        ],
        parallel_run_config=score_run_config,
        allow_reuse=False,
    )

    copying_step = PythonScriptStep(
        name="scorecopystep",
        script_name=env.batchscore_copy_script_path,
        source_directory=env.sources_directory_train,
        arguments=[
            "--output_path",
            output_loc,
            "--scoring_output_filename",
            env.scoring_datastore_output_filename
            if env.scoring_datastore_output_filename is not None else "",
            "--scoring_datastore",
            env.scoring_datastore_storage_name
            if env.scoring_datastore_storage_name is not None else "",
            "--score_container",
            env.scoring_datastore_output_container
            if env.scoring_datastore_output_container is not None else "",
            "--scoring_datastore_key",
            env.scoring_datastore_access_key
            if env.scoring_datastore_access_key is not None else "",
        ],
        inputs=[output_loc],
        allow_reuse=False,
        compute_target=computetarget,
        runconfig=copy_run_config,
    )
    return Pipeline(workspace=ws, steps=[scoring_step, copying_step])
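# Hypothetical usage of the factory above; the experiment name and parameter
# values are placeholders, and the argument objects are assumed to be built by
# the surrounding pipeline code:
from azureml.core import Experiment

scoring_pipeline = get_scoring_pipeline(scoring_dataset, output_loc,
                                        score_run_config, copy_run_config,
                                        computetarget, ws, env)
run = Experiment(ws, "batch-scoring").submit(
    scoring_pipeline,
    pipeline_parameters={"model_name": "my-model", "model_version": "1"})
run.wait_for_completion(show_output=True)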
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(aml_workspace,
                                  e.aml_env_name,
                                  create_new=e.rebuild_env)  #
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    dataset_version_param = PipelineParameter(name="dataset_version",
                                              default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(name="data_file_path",
                                             default_value="none")
    caller_run_id_param = PipelineParameter(name="caller_run_id",
                                            default_value="none")

    # Get dataset name
    dataset_name = e.dataset_name

    # Check to see if dataset exists
    if dataset_name not in aml_workspace.datasets:
        # Create dataset from diabetes sample data
        sample_data = load_diabetes()
        df = pd.DataFrame(data=sample_data.data,
                          columns=sample_data.feature_names)
        df['Y'] = sample_data.target
        file_name = 'diabetes.csv'
        df.to_csv(file_name, index=False)

        # Upload file to default datastore in workspace
        datastore = Datastore.get(aml_workspace, datastore_name)
        target_path = 'training-data/'
        datastore.upload_files(files=[file_name],
                               target_path=target_path,
                               overwrite=True,
                               show_progress=False)

        # Register dataset
        path_on_datastore = os.path.join(target_path, file_name)
        dataset = Dataset.Tabular.from_delimited_files(
            path=(datastore, path_on_datastore))
        dataset = dataset.register(workspace=aml_workspace,
                                   name=dataset_name,
                                   description='diabetes training data',
                                   tags={'format': 'CSV'},
                                   create_new_version=True)

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        'pipeline_data', datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_output",
            pipeline_data,
            "--dataset_version",
            dataset_version_param,
            "--data_file_path",
            data_file_path_param,
            "--caller_run_id",
            caller_run_id_param,
            "--dataset_name",
            dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_input",
            pipeline_data,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if e.run_evaluation.lower() == 'true':
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
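# A possible follow-up (not in the original snippet): the pipeline published
# above can later be looked up by name/version and triggered, assuming
# aml_workspace and the Env instance e from main() are in scope; the experiment
# name is a placeholder:
from azureml.core import Experiment
from azureml.pipeline.core import PublishedPipeline

matched = [p for p in PublishedPipeline.list(aml_workspace)
           if p.name == e.pipeline_name and p.version == e.build_id]
if matched:
    Experiment(aml_workspace, "model-train").submit(
        matched[0], pipeline_parameters={"model_name": e.model_name})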
environment.python.conda_dependencies.add_pip_package("scipy")
environment.python.conda_dependencies.add_pip_package("joblib")
environment.python.conda_dependencies.add_pip_package("numpy")
environment.python.conda_dependencies.add_pip_package("pandas")


# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Use the compute you created above. 
pipeline_run_config.target = pipeline_cluster

# Assign the environment to the run configuration
pipeline_run_config.environment = environment

print ("Run configuration created.")
pipeline_param = PipelineParameter(
  name="data",
  default_value=default)
register_step = PythonScriptStep(name = "Register Model",
                                script_name = "pipelineScript.py",
                                arguments = ['--data', pipeline_param],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)
reg_pipeline=Pipeline(workspace=ws,steps=[register_step])


# Submit the pipeline to be run
pipeline_run1 = Experiment(ws, 'RegExp3').submit(reg_pipeline)
pipeline_run1.wait_for_completion()
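# The "data" PipelineParameter above keeps its default unless overridden at
# submission time; a minimal sketch of such an override (the value is illustrative):
pipeline_run2 = Experiment(ws, 'RegExp3').submit(
    reg_pipeline, pipeline_parameters={"data": "path/to/other/data"})
pipeline_run2.wait_for_completion()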
Example #27
def main():
    cluster_id = os.environ.get("DATABRICKS_CLUSTER_ID", None)
    workspace_name = os.environ.get("AML_WORKSPACE_NAME", None)
    resource_group = os.environ.get("RESOURCE_GROUP", None)
    subscription_id = os.environ.get("SUBSCRIPTION_ID", None)
    tenant_id = os.environ.get("TENANT_ID", None)
    app_id = os.environ.get("SP_APP_ID", None)
    app_secret = os.environ.get("SP_APP_SECRET", None)
    experiment_subfolder = os.environ.get(
        "EXPERIMENT_FOLDER",
        'aml_service/experiment'
    )
    sources_directory = os.environ.get("SOURCES_DIR", None)
    experiment_folder = os.path.join(sources_directory, experiment_subfolder)
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH", None)
    databricks_workspace_name = os.environ.get(
        "DATABRICKS_WORKSPACE_NAME",
        None
    )
    databricks_access_token = os.environ.get("DATABRICKS_ACCESS_TOKEN", None)
    databricks_compute_name_aml = os.environ.get(
        "DATABRICKS_COMPUTE_NAME_AML",
        None
    )
    model_dir = os.environ.get("MODEL_DIR", '/dbfs/model')
    model_name = os.environ.get("MODEL_NAME", 'local-model')

    model_file_name = "%s.pth" % (model_name)
    model_path = os.path.join(model_dir, model_file_name)

    print("The model path will be %s" % (model_path))

    aml_workspace = get_workspace(
        workspace_name,
        resource_group,
        subscription_id,
        tenant_id,
        app_id,
        app_secret)
    print(aml_workspace)

    databricks_compute = get_compute(
        aml_workspace,
        databricks_compute_name_aml,
        resource_group,
        databricks_workspace_name,
        databricks_access_token)
    print(databricks_compute)

    step1 = DatabricksStep(
        name="DBPythonInLocalMachine",
        num_workers=1,
        python_script_name=train_script_path,
        source_directory=sources_directory,
        run_name='DB_Python_Local_demo',
        existing_cluster_id=cluster_id,
        compute_target=databricks_compute,
        allow_reuse=False,
        python_script_params=['--MODEL_PATH', model_path]
    )

    step2 = DatabricksStep(
        name="RegisterModel",
        num_workers=1,
        python_script_name="register_model.py",
        source_directory=experiment_folder,
        run_name='Register_model',
        existing_cluster_id=cluster_id,
        compute_target=databricks_compute,
        allow_reuse=False,
        python_script_params=[
            '--MODEL_PATH', model_path,
            '--TENANT_ID', tenant_id,
            '--APP_ID', app_id,
            '--APP_SECRET', app_secret,
            '--MODEL_NAME', model_name]
    )

    step2.run_after(step1)
    print("Step lists created")

    pipeline = Pipeline(
        workspace=aml_workspace,
        # steps=[step1])
        steps=[step1, step2])
    print("Pipeline is built")

    pipeline.validate()
    print("Pipeline validation complete")

    pipeline_run = pipeline.submit(experiment_name="pipetest")

    print("Pipeline is submitted for execution")

    pipeline_details = pipeline_run.get_details()

    pipeline_run_id = pipeline_details['runId']

    azure_run_url = get_experiment_run_url(
        subscription_id,
        resource_group,
        workspace_name,
        pipeline_run_id
    )

    print("To check details of the Pipeline run, go to " + azure_run_url)

    pipeline_status = pipeline_run.get_status()

    timer_mod = 0

    while pipeline_status == 'Running' or pipeline_status == 'NotStarted':
        timer_mod = timer_mod + 10
        time.sleep(10)
        if (timer_mod % 30) == 0:
            print(
                "Status: %s. %s seconds have passed." %
                (pipeline_status, timer_mod)
            )
        pipeline_status = pipeline_run.get_status()
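    # Note: pipeline_run.wait_for_completion(show_output=True) would block until
    # the run finishes, as an alternative to this manual polling loop.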

    if pipeline_status == 'Failed':
        print("AML Pipelne failed. Check %s for details." % (azure_run_url))
        sys.exit(1)
    else:
        print(pipeline_status)

    print("Pipeline completed")
Example #28
environment.python.conda_dependencies.add_pip_package("numpy")
environment.python.conda_dependencies.add_pip_package("pandas")

# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Use the compute you created above.
pipeline_run_config.target = pipeline_cluster

# Assign the environment to the run configuration
pipeline_run_config.environment = environment

print("Run configuration created.")
pipeline_param = PipelineParameter(name="data", default_value=default)
register_step = PythonScriptStep(name="Register Model",
                                 script_name="pipelineScript.py",
                                 arguments=['--data', pipeline_param],
                                 compute_target=pipeline_cluster,
                                 runconfig=pipeline_run_config,
                                 allow_reuse=False)
service_step = PythonScriptStep(
    name="CreateService",
    script_name="serviceCreation_PipelineScript.py",
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=False)
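# Note: register_step and service_step share no data dependency and no explicit
# run_after() ordering, so Azure ML may run them in parallel; if the service
# should only be created after the model is registered,
# service_step.run_after(register_step) would enforce that ordering.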
succ_pipeline = Pipeline(workspace=ws, steps=[register_step, service_step])

# Submit the pipeline to be run
pipeline_run1 = Experiment(ws, 'RegExp3').submit(succ_pipeline)
pipeline_run1.wait_for_completion()
Example #29
# step 2, submit pipelines
submit_pipelines = PythonScriptStep(
    name='submit pipelines',
    script_name="pipelines_submit.py",
    # arguments=["--overwrite_published_pipelines", overwrite_published_pipelines],
    compute_target=cpu_compute_target,
    source_directory=script_folder,
    runconfig=cpu_compute_run_config,
    allow_reuse=False,
    hash_paths=["."])
print("pipeline submit step created")

submit_pipelines.run_after(build_pipelines)

pipeline = Pipeline(workspace=ws, steps=[build_pipelines, submit_pipelines])
print("Pipeline created")

pipeline.validate()
print("Validation complete")

pipeline_name = 'prednet_master'
published_pipeline = pipeline.publish(name=pipeline_name)
print("pipeline id: ", published_pipeline.id)

datastore = ws.get_default_datastore()

schedule = Schedule.create(workspace=ws,
                           name=pipeline_name + "_sch",
                           pipeline_id=published_pipeline.id,
                           experiment_name='Schedule_Run',
Example #30
def main(cluster_id, workspace_name, resource_group, subscription_id,
         tenant_id, app_id, app_secret, experiment_folder, project_folder,
         train_script_path, databricks_workspace, databricks_access_token,
         databricks_compute_name, mdl_dir, mdl_name):
    mdl_file_name = "%s.pth" % (mdl_name)
    mdl_path = os.path.join(mdl_dir, mdl_file_name)

    print("The model path will be %s" % (mdl_path))

    ws = get_workspace(workspace_name, resource_group, subscription_id,
                       tenant_id, app_id, app_secret)
    print(ws)

    databricks_compute = get_compute(ws, databricks_compute_name,
                                     resource_group, databricks_workspace,
                                     databricks_access_token)
    print(databricks_compute)

    step1 = DatabricksStep(name="DBPythonInLocalMachine",
                           num_workers=1,
                           python_script_name=train_script_path,
                           source_directory=project_folder,
                           run_name='DB_Python_Local_demo',
                           existing_cluster_id=cluster_id,
                           compute_target=databricks_compute,
                           allow_reuse=False,
                           python_script_params=['--MODEL_PATH', mdl_path])

    step2 = DatabricksStep(name="RegisterModel",
                           num_workers=1,
                           python_script_name="Register.py",
                           source_directory=experiment_folder,
                           run_name='Register_model',
                           existing_cluster_id=cluster_id,
                           compute_target=databricks_compute,
                           allow_reuse=False,
                           python_script_params=[
                               '--MODEL_PATH', mdl_path, '--TENANT_ID',
                               tenant_id, '--APP_ID', app_id, '--APP_SECRET',
                               app_secret, '--MODEL_NAME', mdl_name
                           ])

    step2.run_after(step1)
    print("Step lists created")

    pipeline = Pipeline(
        workspace=ws,
        # steps=[step1])
        steps=[step1, step2])
    print("Pipeline is built")

    pipeline.validate()
    print("Pipeline validation complete")

    pipeline_run = pipeline.submit(experiment_name="pipetest")

    print("Pipeline is submitted for execution")

    pipeline_details = pipeline_run.get_details()

    pipeline_run_id = pipeline_details['runId']

    azure_run_url = get_experiment_run_url(subscription_id, resource_group,
                                           workspace_name, pipeline_run_id)

    print("To check details of the Pipeline run, go to " + azure_run_url)

    pipeline_status = pipeline_run.get_status()

    timer_mod = 0

    while pipeline_status == 'Running' or pipeline_status == 'NotStarted':
        timer_mod = timer_mod + 10
        time.sleep(10)
        if (timer_mod % 30) == 0:
            print("Status: %s. %s seconds have passed." %
                  (pipeline_status, timer_mod))
        pipeline_status = pipeline_run.get_status()

    if pipeline_status == 'Failed':
        sys.exit("AML Pipeline failed")
    else:
        print(pipeline_status)

    print("Pipeline completed")