Example 1
def get_run_configs(workspace, compute_target, env):
    environment = get_environment(workspace=workspace,
                                  env_name=env.aml_env_scoring_name,
                                  conda_dependencies=env.aml_conda_score_file,
                                  create_new=env.rebuild_scoring_env,
                                  enable_docker=True,
                                  use_gpu=env.use_gpu_for_scoring)
    score_run_config = ParallelRunConfig(
        environment=environment,
        entry_script=env.batchscore_script_path,
        source_directory=env.source_train_directory,
        error_threshold=10,
        output_action='append_row',
        compute_target=compute_target,
        node_count=env.max_nodes_scoring,
        run_invocation_timeout=300)
    copy_run_config = RunConfiguration()
    copy_run_config.environment = get_environment(
        workspace=workspace,
        env_name=env.aml_env_scorecopy_name,
        conda_dependencies=env.aml_conda_scorecopy_file,
        create_new=env.rebuild_scoring_env,
        enable_docker=True,
        use_gpu=env.use_gpu_for_scoring)

    return score_run_config, copy_run_config
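The two configurations returned by get_run_configs are typically wired into a ParallelRunStep (parallel scoring) and a PythonScriptStep (copying the results); a minimal sketch, assuming workspace, compute_target, and env are defined as above, and that the dataset, output, and script names below are hypothetical:

score_config, copy_config = get_run_configs(workspace, compute_target, env)

scoring_step = ParallelRunStep(
    name="scoring-step",
    parallel_run_config=score_config,
    inputs=[scoring_dataset.as_named_input("scoring_data")],  # hypothetical dataset
    output=scoring_output,  # hypothetical PipelineData
    allow_reuse=False)

copy_step = PythonScriptStep(
    name="score-copy-step",
    script_name="score_copy.py",  # hypothetical script
    source_directory=env.source_train_directory,
    inputs=[scoring_output],
    compute_target=compute_target,
    runconfig=copy_config)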
Example 2
def build_parallel_run_config(source_directory, train_env, compute, nodecount,
                              workercount, timeout):
    parallel_run_config = ParallelRunConfig(
        source_directory=source_directory,
        entry_script='train_automl.py',
        mini_batch_size="1",  # do not modify this setting
        run_invocation_timeout=timeout,
        run_max_try=3,
        error_threshold=-1,
        output_action="append_row",
        environment=train_env,
        process_count_per_node=workercount,
        compute_target=compute,
        node_count=nodecount)
    validate_parallel_run_config(parallel_run_config)
    return parallel_run_config
def build_parallel_run_config(train_env, compute, nodecount, workercount, timeout):
    from azureml.pipeline.steps import ParallelRunConfig
    from common.scripts.helper import validate_parallel_run_config
    parallel_run_config = ParallelRunConfig(
        source_directory='./automl_train/scripts',
        entry_script='train_minibatch.py',
        mini_batch_size="1",  # do not modify this setting
        run_invocation_timeout=timeout,
        error_threshold=-1,
        output_action="append_row",
        environment=train_env,
        process_count_per_node=workercount,
        compute_target=compute,
        node_count=nodecount)
    validate_parallel_run_config(parallel_run_config)
    return parallel_run_config
def get_run_configs(
    ws: Workspace, computetarget: ComputeTarget, env: Env
) -> Tuple[ParallelRunConfig, RunConfiguration]:
    """
    Creates the necessary run configurations required by the
    pipeline to enable parallelized scoring.

    :param ws: AML Workspace
    :param computetarget: AML Compute target
    :param env: Environment Variables

    :returns: Tuple[Scoring Run configuration, Score copy run configuration]
    """

    # get a conda environment for scoring
    environment = get_environment(
        ws,
        env.aml_env_name_scoring,
        conda_dependencies_file=env.aml_env_score_conda_dep_file,
        enable_docker=True,
        use_gpu=env.use_gpu_for_scoring,
        create_new=env.rebuild_env_scoring,
    )

    score_run_config = ParallelRunConfig(
        entry_script=env.batchscore_script_path,
        source_directory=env.sources_directory_train,
        error_threshold=10,
        output_action="append_row",
        compute_target=computetarget,
        node_count=env.max_nodes_scoring,
        environment=environment,
        run_invocation_timeout=300,
    )

    copy_run_config = RunConfiguration()
    copy_run_config.environment = get_environment(
        ws,
        env.aml_env_name_score_copy,
        conda_dependencies_file=env.aml_env_scorecopy_conda_dep_file,
        enable_docker=True,
        use_gpu=env.use_gpu_for_scoring,
        create_new=env.rebuild_env_scoring,
    )
    return (score_run_config, copy_run_config)
Example 5
print("SDK version:", azureml.core.VERSION)

dataset_name = 'grib-dataset'

ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

datastore = ws.get_default_datastore()

input_ds = Dataset.get_by_name(ws, dataset_name)
batch_data = DatasetConsumptionConfig("batch_dataset", input_ds, mode='mount')

output_dir = PipelineData(name='batch_output', datastore=datastore)

parallel_run_config = ParallelRunConfig.load_yaml(workspace=ws,
                                                  path='convert_parallel.yml')

batch_step = ParallelRunStep(name="batch-conversion-step",
                             parallel_run_config=parallel_run_config,
                             arguments=['--data_output_path', output_dir],
                             inputs=[batch_data],
                             output=output_dir,
                             allow_reuse=False)

steps = [batch_step]

pipeline = Pipeline(workspace=ws, steps=steps)
pipeline.validate()

pipeline_run = Experiment(ws, 'convert-batch-pipeline').submit(pipeline)
pipeline_run.wait_for_completion()
def main():
    """Build pipeline."""
    # Environment variables
    env = Env()

    # Azure ML workspace
    aml_workspace = Workspace.get(
        name=env.workspace_name,
        subscription_id=env.subscription_id,
        resource_group=env.resource_group,
    )
    logger.info(f"Azure ML workspace: {aml_workspace}")

    # Azure ML compute cluster
    aml_compute = get_compute(aml_workspace, env.compute_name)
    logger.info(f"Aazure ML compute cluster: {aml_compute}")

    # Azure ML environment
    environment = Environment(name=env.aml_env_name)
    conda_dep = CondaDependencies(
        conda_dependencies_file_path="./local_development/dev_dependencies.yml"
    )
    environment.python.conda_dependencies = conda_dep

    run_config = RunConfiguration()
    run_config.environment = environment

    # Pipeline Data
    preparation_pipelinedata = PipelineData("preparation_pipelinedata",
                                            is_directory=True).as_dataset()
    extraction_pipelinedata = PipelineData("extraction_pipelinedata",
                                           is_directory=True)
    training_pipelinedata = PipelineData("training_pipelinedata",
                                         is_directory=True)

    # List of pipeline steps
    step_list = list()
    preparation_step = PythonScriptStep(
        name="preparation-step",
        compute_target=aml_compute,
        source_directory=env.sources_directory_train,
        script_name=env.preparation_step_script_path,
        outputs=[preparation_pipelinedata],
        arguments=[
            "--input_path", env.input_dir, "--output_path",
            preparation_pipelinedata, "--datastore_name",
            env.blob_datastore_name
        ],
        runconfig=run_config)

    step_list.append(preparation_step)

    parallel_run_config = ParallelRunConfig(
        source_directory=env.sources_directory_train,
        entry_script=env.extraction_step_script_path,
        mini_batch_size=env.mini_batch_size,
        error_threshold=env.error_threshold,
        output_action="append_row",
        environment=environment,
        compute_target=aml_compute,
        node_count=env.node_count,
        run_invocation_timeout=env.run_invocation_timeout,
        process_count_per_node=env.process_count_per_node,
        append_row_file_name="extraction_output.txt")

    extraction_step = ParallelRunStep(
        name="extraction-step",
        inputs=[preparation_pipelinedata],
        output=extraction_pipelinedata,
        arguments=["--output_dir", extraction_pipelinedata],
        parallel_run_config=parallel_run_config)
    step_list.append(extraction_step)

    training_step = PythonScriptStep(
        name="traning-step",
        compute_target=aml_compute,
        source_directory=env.sources_directory_train,
        script_name=env.training_step_script_path,
        inputs=[extraction_pipelinedata],
        outputs=[training_pipelinedata],
        arguments=[
            "--input_dir", extraction_pipelinedata, "--output_dir",
            training_pipelinedata
        ],
        runconfig=run_config)

    step_list.append(training_step)

    # Build pipeline
    pipeline = Pipeline(workspace=aml_workspace, steps=step_list)
    pipeline.validate()
    logger.info(f"Built pipeline {pipeline}")

    # Publish pipeline
    published_pipeline = pipeline.publish(
        env.pipeline_name,
        description=env.pipeline_name,
        version=datetime.utcnow().isoformat())
    try:
        pipeline_endpoint = PipelineEndpoint.get(
            workspace=aml_workspace, name=env.pipeline_endpoint_name)
        pipeline_endpoint.add_default(published_pipeline)
    except ErrorResponseException:
        pipeline_endpoint = PipelineEndpoint.publish(
            workspace=aml_workspace,
            name=env.pipeline_endpoint_name,
            pipeline=published_pipeline,
            description=env.pipeline_endpoint_name)
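    # Once published, the endpoint's default version can be invoked by name.
    # A short sketch (assumption: env.pipeline_name also serves as the
    # experiment name under which the run is submitted):
    endpoint = PipelineEndpoint.get(workspace=aml_workspace,
                                    name=env.pipeline_endpoint_name)
    endpoint_run = endpoint.submit(env.pipeline_name)
    logger.info(f"Submitted pipeline run {endpoint_run.id} via endpoint")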
parser.add_argument("--runconfig",
                    type=str,
                    help="Path to the parallel runconfig for pipeline",
                    dest="runconfig",
                    required=True)
args = parser.parse_args()
print(f'Arguments: {args}')

print('Connecting to workspace')
ws = Workspace.from_config()
print(
    f'WS name: {ws.name}\nRegion: {ws.location}\nSubscription id: {ws.subscription_id}\nResource group: {ws.resource_group}'
)

print('Loading parallel runconfig for pipeline')
parallel_run_config = ParallelRunConfig.load_yaml(workspace=ws,
                                                  path=args.runconfig)

print('Loading default batch dataset')
batch_dataset = Dataset.get_by_name(ws, args.dataset)

# Parameterize the dataset input and the output dataset name (the batch scoring result) for the pipeline
batch_dataset_parameter = PipelineParameter(name="batch_dataset",
                                            default_value=batch_dataset)
batch_dataset_consumption = DatasetConsumptionConfig(
    "batch_dataset", batch_dataset_parameter).as_mount()

datastore = ws.get_default_datastore()
output_dataset_name = "batch_scoring_results"

# Existing GA code - does not allow specifying the path on the datastore
# output_dataset = PipelineData(name='batch_output', datastore=datastore).as_dataset()
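The newer OutputFileDatasetConfig API does allow choosing the destination path on the datastore and registering the result as a dataset; a sketch under the names above ('{run-id}' is a supported path placeholder):

from azureml.data import OutputFileDatasetConfig

output_dataset = OutputFileDatasetConfig(
    name='batch_output',
    destination=(datastore, 'batch_scoring_results/{run-id}')
).register_on_complete(name=output_dataset_name)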
def get_pipeline(aml_compute: ComputeTarget, blob_ds: Datastore,
                 batch_env: Environment, tf_env: Environment) -> list:
    """
    Creates the pipeline steps
    Parameters:
        aml_compute (ComputeTarget): a reference to a compute target
        blob_ds (Datastore): a reference to a datastore
        batch_env (Environment): a reference to an environment object
        tf_env (Environment): a horovod/tf environment
    Returns:
        list: the pipeline steps to execute
    """

    # We need a step that generates data for the downstream steps
    pipeline_files = PipelineData("pipeline_files",
                                  datastore=blob_ds).as_dataset()

    # Pipeline parameters to use with every run
    is_debug = PipelineParameter("is_debug", default_value=False)
    relay_connection_name = PipelineParameter("debug_relay_connection_name",
                                              default_value="none")

    single_step_config = RunConfiguration()
    single_step_config.environment = batch_env
    single_step = PythonScriptStep(
        name=f"single-step",
        script_name="samples/azure_ml_advanced/steps/single_step.py",
        source_directory=".",
        runconfig=single_step_config,
        arguments=[
            "--pipeline-files", pipeline_files, "--is-debug", is_debug,
            "--debug-relay-connection-name", relay_connection_name,
            "--debug-port", 5678, "--debug-relay-connection-string-secret",
            debug_connection_string_secret_name
        ],
        inputs=[],
        outputs=[pipeline_files],
        compute_target=aml_compute,
        allow_reuse=False)

    output_dir = PipelineData("output_dir")

    parallel_run_config = ParallelRunConfig(
        entry_script="samples/azure_ml_advanced/steps/parallel_step.py",
        source_directory=".",
        mini_batch_size="5",
        output_action="summary_only",
        environment=batch_env,
        compute_target=aml_compute,
        error_threshold=10,
        run_invocation_timeout=600,  # very important for debugging
        node_count=2,
        process_count_per_node=1)

    parallelrun_step = ParallelRunStep(
        name="parallel-run-step",
        parallel_run_config=parallel_run_config,
        inputs=[pipeline_files],
        output=output_dir,
        arguments=[
            "--is-debug", is_debug, "--debug-relay-connection-name",
            relay_connection_name, "--debug-port", 5679,
            "--debug-relay-connection-string-secret",
            debug_connection_string_secret_name
        ],
        allow_reuse=False)

    parallelrun_step.run_after(single_step)

    distr_config = MpiConfiguration(process_count_per_node=1, node_count=2)

    src = ScriptRunConfig(
        source_directory=".",
        script="samples/azure_ml_advanced/steps/mpi/mpi_step_starter.py",
        arguments=[
            "--input-ds", pipeline_files, "--is-debug", is_debug,
            "--debug-relay-connection-name", relay_connection_name,
            "--debug-port", 5680, "--debug-relay-connection-string-secret",
            debug_connection_string_secret_name
        ],
        compute_target=aml_compute,
        environment=tf_env,
        distributed_job_config=distr_config,
    )

    mpi_step = PythonScriptStep(
        name="mpi-step",
        script_name="samples/azure_ml_advanced/steps/mpi/mpi_step_starter.py",
        arguments=[
            "--input-ds", pipeline_files, "--is-debug", is_debug,
            "--debug-relay-connection-name", relay_connection_name,
            "--debug-port", 5680, "--debug-relay-connection-string-secret",
            debug_connection_string_secret_name
        ],
        compute_target=aml_compute,
        inputs=[pipeline_files],
        outputs=[],
        runconfig=src.run_config,
        source_directory=".")

    mpi_step.run_after(parallelrun_step)

    print("Pipeline Steps Created")

    steps = [single_step, parallelrun_step, mpi_step]

    print(f"Returning {len(steps)} steps")
    return steps
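The returned steps are then assembled into a pipeline in the usual way; a minimal sketch, assuming ws and the two environments are already resolved (the experiment name is hypothetical):

steps = get_pipeline(aml_compute, blob_ds, batch_env, tf_env)
pipeline = Pipeline(workspace=ws, steps=steps)
pipeline.validate()
pipeline_run = Experiment(ws, 'advanced-debug-pipeline').submit(pipeline)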
def get_backtest_pipeline(
    experiment: Experiment,
    dataset: TabularDataset,
    process_per_node: int,
    node_count: int,
    compute_target: ComputeTarget,
    automl_settings: Dict[str, Any],
    step_size: int,
    step_number: int,
    model_name: Optional[str] = None,
    model_uid: Optional[str] = None,
) -> Pipeline:
    """
    :param experiment: The experiment used to run the pipeline.
    :param dataset: Tabular dataset to be used for model training.
    :param process_per_node: The number of processes per node. Generally it should be the number
                             of cores on the node divided by two.
    :param node_count: The number of nodes to be used.
    :param compute_target: The compute target to be used to run the pipeline.
    :param automl_settings: The dictionary with AutoML settings.
    :param step_size: The number of periods to step back in backtesting.
    :param step_number: The number of backtesting iterations.
    :param model_name: The name of a model to be backtested.
    :param model_uid: The uid to mark models from this run of the experiment.
    :return: The pipeline to be used for model retraining.
             **Note:** The output will be uploaded in the pipeline output
             called 'score'.
    """
    jasmine_client = JasmineClient(
        service_context=experiment.workspace.service_context,
        experiment_name=experiment.name,
        experiment_id=experiment.id,
    )
    env = jasmine_client.get_curated_environment(
        scenario=Scenarios.AUTOML,
        enable_dnn=False,
        enable_gpu=False,
        compute=compute_target,
        compute_sku=experiment.workspace.compute_targets.get(
            compute_target.name
        ).vm_size,
    )
    data_results = PipelineData(
        name="results", datastore=None, pipeline_output_name="results"
    )
    ############################################################
    # Split the data set using python script.
    ############################################################
    run_config = RunConfiguration()
    run_config.docker.use_docker = True
    run_config.environment = env

    utilities.set_environment_variables_for_run(run_config)

    split_data = PipelineData(name="split_data_output", datastore=None).as_dataset()
    split_step = PythonScriptStep(
        name="split_data_for_backtest",
        script_name="data_split.py",
        inputs=[dataset.as_named_input("training_data")],
        outputs=[split_data],
        source_directory=PROJECT_FOLDER,
        arguments=[
            "--step-size",
            step_size,
            "--step-number",
            step_number,
            "--time-column-name",
            automl_settings.get("time_column_name"),
            "--time-series-id-column-names",
            automl_settings.get("grain_column_names"),
            "--output-dir",
            split_data,
        ],
        runconfig=run_config,
        compute_target=compute_target,
        allow_reuse=False,
    )
    ############################################################
    # Run the backtest using a parallel run step.
    ############################################################
    settings_path = os.path.join(PROJECT_FOLDER, SETTINGS_FILE)
    hru.dump_object_to_json(automl_settings, settings_path)
    mini_batch_size = PipelineParameter(name="batch_size_param", default_value=str(1))
    back_test_config = ParallelRunConfig(
        source_directory=PROJECT_FOLDER,
        entry_script="retrain_models.py",
        mini_batch_size=mini_batch_size,
        error_threshold=-1,
        output_action="append_row",
        append_row_file_name="outputs.txt",
        compute_target=compute_target,
        environment=env,
        process_count_per_node=process_per_node,
        run_invocation_timeout=3600,
        node_count=node_count,
    )
    utilities.set_environment_variables_for_run(back_test_config)
    forecasts = PipelineData(name="forecasts", datastore=None)
    if model_name:
        parallel_step_name = "{}-backtest".format(model_name.replace("_", "-"))
    else:
        parallel_step_name = "AutoML-backtest"

    prs_args = [
        "--target_column_name",
        automl_settings.get("label_column_name"),
        "--output-dir",
        forecasts,
    ]
    if model_name is not None:
        prs_args.append("--model-name")
        prs_args.append(model_name)
    if model_uid is not None:
        prs_args.append("--model-uid")
        prs_args.append(model_uid)
    backtest_prs = ParallelRunStep(
        name=parallel_step_name,
        parallel_run_config=back_test_config,
        arguments=prs_args,
        inputs=[split_data],
        output=forecasts,
        allow_reuse=False,
    )
    ############################################################
    # Collect the output and return it as the 'score' output.
    ############################################################
    collection_step = PythonScriptStep(
        name="score",
        script_name="score.py",
        inputs=[forecasts.as_mount()],
        outputs=[data_results],
        source_directory=PROJECT_FOLDER,
        arguments=["--forecasts", forecasts, "--output-dir", data_results],
        runconfig=run_config,
        compute_target=compute_target,
        allow_reuse=False,
    )
    # Build and return the pipeline.
    return Pipeline(
        workspace=experiment.workspace,
        steps=[split_step, backtest_prs, collection_step],
    )
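A typical invocation submits the returned pipeline and waits for the collected forecasts; a sketch with assumed argument values (the dataset variable is hypothetical):

pipeline = get_backtest_pipeline(
    experiment=experiment,
    dataset=training_dataset,
    process_per_node=4,
    node_count=2,
    compute_target=compute_target,
    automl_settings=automl_settings,
    step_size=7,
    step_number=4)
pipeline_run = experiment.submit(pipeline)
pipeline_run.wait_for_completion(show_output=False)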
Example 10
from azureml.core.runconfig import DEFAULT_GPU_IMAGE

cd = CondaDependencies.create(pip_packages=[
    "tensorflow-gpu==1.15.2", "azureml-core", "azureml-dataprep[fuse]"
])
env = Environment(name="parallelenv")
env.python.conda_dependencies = cd
env.docker.base_image = DEFAULT_GPU_IMAGE

from azureml.pipeline.steps import ParallelRunConfig

parallel_run_config = ParallelRunConfig(environment=env,
                                        entry_script="batch_scoring.py",
                                        source_directory="scripts",
                                        output_action="append_row",
                                        mini_batch_size="20",
                                        error_threshold=1,
                                        compute_target=compute_target,
                                        process_count_per_node=2,
                                        node_count=1)

from azureml.pipeline.steps import ParallelRunStep
from datetime import datetime

parallel_step_name = "batchscoring-" + datetime.now().strftime("%Y%m%d%H%M")

label_config = label_ds.as_named_input("labels_input")

batch_score_step = ParallelRunStep(
    name=parallel_step_name,
    inputs=[input_images.as_named_input("input_images")],
Example 11
else:
    runId = run.parent.id

dataset = Dataset.File.from_files(
    path=[(mydatastore, f"rawdata/daystoprocess/{runId}/*.csv")])

env = Environment(name="parallelenv")

env.from_conda_specification('parallelenv', './DataIngest/parallelenv.yml')

parallel_run_config = ParallelRunConfig(
    source_directory='.',
    entry_script='./DataIngest/parallelrunstep.py',
    mini_batch_size="1",
    error_threshold=30,
    output_action="append_row",
    environment=env,
    compute_target='cpu-cluster',
    append_row_file_name="my_outputs.txt",
    run_invocation_timeout=1200,
    node_count=1)

parallelrun_step = ParallelRunStep(
    name="parallelapicalls",
    parallel_run_config=parallel_run_config,
    arguments=["--arg1", string_pipeline_param],
    inputs=[dataset.as_named_input("inputds")],
    output=output_dir
    # models=[model]  # not needed here; only relevant for batch inferencing
    # allow_reuse=True
)
Example 12
# Get the batch dataset for input
batch_data_set = ws.datasets['batch-data']

# Set the output location
default_ds = ws.get_default_datastore()
output_dir = PipelineData(name='inferences',
                          datastore=default_ds,
                          output_path_on_compute='results')

# Define the parallel run step configuration
parallel_run_config = ParallelRunConfig(
    source_directory='batch_scripts',
    entry_script="batch_scoring_script.py",
    mini_batch_size="5",
    error_threshold=10,
    output_action="append_row",
    environment=batch_env,
    compute_target=aml_cluster,
    node_count=4)

# Create the parallel run step
parallelrun_step = ParallelRunStep(
    name='batch-score',
    parallel_run_config=parallel_run_config,
    inputs=[batch_data_set.as_named_input('batch_data')],
    output=output_dir,
    arguments=[],
    allow_reuse=True)
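To execute this step it is wrapped in a pipeline and submitted; a minimal sketch, assuming ws from the surrounding context (the experiment name is hypothetical):

pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])
pipeline_run = Experiment(ws, 'batch-scoring-pipeline').submit(pipeline)
pipeline_run.wait_for_completion(show_output=True)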
Example 13
batch_env = Environment(name="batch_environment")
batch_env.python.conda_dependencies = batch_conda_deps
batch_env.docker.enabled = True
batch_env.docker.base_image = DEFAULT_GPU_IMAGE

from azureml.pipeline.core import PipelineParameter
from azureml.pipeline.steps import ParallelRunConfig

parallel_run_config = ParallelRunConfig(
    source_directory='',
    entry_script="batchscore3.py",
    mini_batch_size=PipelineParameter(name="batch_size_param",
                                      default_value="5"),
    error_threshold=10,
    output_action="append_row",
    append_row_file_name="mnist_outputs.txt",
    environment=batch_env,
    compute_target=compute_target,
    process_count_per_node=PipelineParameter(name="process_count_param",
                                             default_value=2),
    node_count=2)

from azureml.pipeline.steps import ParallelRunStep
from datetime import datetime

parallel_step_name = "batchscoring-" + datetime.now().strftime("%Y%m%d%H%M")

parallelrun_step = ParallelRunStep(name=parallel_step_name,
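                                   # The original example is truncated here; a
                                   # hedged completion using the names above:
                                   parallel_run_config=parallel_run_config,
                                   inputs=[input_mnist_ds],  # hypothetical input
                                   output=output_dir,  # hypothetical PipelineData
                                   allow_reuse=False)

# The two PipelineParameters declared above can be overridden per submission
# (experiment name is hypothetical):
pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])
pipeline_run = Experiment(ws, 'mnist-batch').submit(
    pipeline,
    pipeline_parameters={'batch_size_param': '10', 'process_count_param': 3})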
Example 14
# Add dependencies
aml_run_config.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas', 'scikit-learn'],
    pip_packages=[
        'azureml-sdk', 'azureml-dataprep[fuse, pandas]',
        'azureml-dataset-runtime[pandas, fuse]', 'tensorflow', 'keras',
        'textblob', 'nltk', 'fuzzywuzzy', 'azureml-defaults', 'azureml-core'
    ],
    pin_sdk_version=False)

parallel_run_config = ParallelRunConfig(
    source_directory='./',
    entry_script='skill_recommender_AML.py',
    mini_batch_size='5KB',
    error_threshold=-1,
    output_action='append_row',
    environment=aml_run_config,
    compute_target=compute_target,
    process_count_per_node=PipelineParameter(name='process_count_param',
                                             default_value=2),
    node_count=2,
    run_invocation_timeout=600)

parallelrun_step = ParallelRunStep(name='skill-extractor-parallel',
                                   parallel_run_config=parallel_run_config,
                                   inputs=[named_emp_ds],
                                   output=output_dir,
                                   allow_reuse=True)

pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])
experiment = Experiment(ws, 'skill-extractor-parallel')
pipeline_run = experiment.submit(pipeline)
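The submitted run can then be monitored to completion:

pipeline_run.wait_for_completion(show_output=True)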