Beispiel #1
0
def create_run_config(azure_config: AzureConfig,
                      source_config: SourceConfig,
                      all_azure_dataset_ids: List[str],
                      all_dataset_mountpoints: List[str],
                      environment_name: str = "") -> ScriptRunConfig:
    """
    Creates a configuration to run the InnerEye training script in AzureML.
    :param azure_config: azure related configurations to use for model scale-out behaviour
    :param source_config: configurations for model execution, such as name and execution mode
    :param all_azure_dataset_ids: The name of all datasets on blob storage that will be used for this run.
    :param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points.
    :param environment_name: If specified, try to retrieve the existing Python environment with this name. If that
    is not found, create one from the Conda files provided in `source_config`. This parameter is meant to be used
    when running inference for an existing model.
    :return: The configured script run.
    """
    dataset_consumptions = create_dataset_consumptions(
        azure_config, all_azure_dataset_ids, all_dataset_mountpoints)
    # AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
    entry_script_relative_path = source_config.entry_script.relative_to(
        source_config.root_folder).as_posix()
    logging.info(
        f"Entry script {entry_script_relative_path} ({source_config.entry_script} relative to "
        f"source directory {source_config.root_folder})")
    max_run_duration = None
    if azure_config.max_run_duration:
        max_run_duration = run_duration_string_to_seconds(
            azure_config.max_run_duration)
    workspace = azure_config.get_workspace()
    run_config = RunConfiguration(
        script=entry_script_relative_path,
        arguments=source_config.script_params,
    )
    run_config.environment = get_or_create_python_environment(
        azure_config, source_config, environment_name=environment_name)
    run_config.target = azure_config.cluster
    run_config.max_run_duration_seconds = max_run_duration
    if azure_config.num_nodes > 1:
        distributed_job_config = MpiConfiguration(
            node_count=azure_config.num_nodes)
        run_config.mpi = distributed_job_config
        run_config.framework = "Python"
        run_config.communicator = "IntelMpi"
        run_config.node_count = distributed_job_config.node_count
    if len(dataset_consumptions) > 0:
        run_config.data = {
            dataset.name: dataset
            for dataset in dataset_consumptions
        }
    # Use blob storage for storing the source, rather than the FileShares section of the storage account.
    run_config.source_directory_data_store = workspace.datastores.get(
        WORKSPACE_DEFAULT_BLOB_STORE_NAME).name
    script_run_config = ScriptRunConfig(
        source_directory=str(source_config.root_folder),
        run_config=run_config,
    )
    if azure_config.hyperdrive:
        script_run_config = source_config.hyperdrive_config_func(
            script_run_config)  # type: ignore
    return script_run_config
Beispiel #2
0
def create_run_config(azure_config: AzureConfig,
                      source_config: SourceConfig,
                      azure_dataset_id: str = "",
                      environment_name: str = "") -> ScriptRunConfig:
    """
    Creates a configuration to run the InnerEye training script in AzureML.
    :param azure_config: azure related configurations to use for model scale-out behaviour
    :param source_config: configurations for model execution, such as name and execution mode
    :param azure_dataset_id: The name of the dataset in blob storage to be used for this run. This can be an empty
    string to not use any datasets.
    :param environment_name: If specified, try to retrieve the existing Python environment with this name. If that
    is not found, create one from the Conda files provided in `source_config`. This parameter is meant to be used
    when running inference for an existing model.
    :return: The configured script run.
    """
    if azure_dataset_id:
        azureml_dataset = get_or_create_dataset(azure_config, azure_dataset_id=azure_dataset_id)
        if not azureml_dataset:
            raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
        named_input = azureml_dataset.as_named_input(INPUT_DATA_KEY)
        dataset_consumption = named_input.as_mount() if azure_config.use_dataset_mount else named_input.as_download()
    else:
        dataset_consumption = None
    # AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
    entry_script_relative_path = source_config.entry_script.relative_to(source_config.root_folder).as_posix()
    logging.info(f"Entry script {entry_script_relative_path} ({source_config.entry_script} relative to "
                 f"source directory {source_config.root_folder})")
    max_run_duration = None
    if azure_config.max_run_duration:
        max_run_duration = run_duration_string_to_seconds(azure_config.max_run_duration)
    workspace = azure_config.get_workspace()
    run_config = RunConfiguration(
        script=entry_script_relative_path,
        arguments=source_config.script_params,
    )
    run_config.environment = get_or_create_python_environment(azure_config, source_config,
                                                              environment_name=environment_name)
    run_config.target = azure_config.cluster
    run_config.max_run_duration_seconds = max_run_duration
    if azure_config.num_nodes > 1:
        distributed_job_config = MpiConfiguration(node_count=azure_config.num_nodes)
        run_config.mpi = distributed_job_config
        run_config.framework = "Python"
        run_config.communicator = "IntelMpi"
        run_config.node_count = distributed_job_config.node_count
    if dataset_consumption:
        run_config.data = {dataset_consumption.name: dataset_consumption}
    # Use blob storage for storing the source, rather than the FileShares section of the storage account.
    run_config.source_directory_data_store = workspace.datastores.get(WORKSPACE_DEFAULT_BLOB_STORE_NAME).name
    script_run_config = ScriptRunConfig(
        source_directory=str(source_config.root_folder),
        run_config=run_config,
    )
    if azure_config.hyperdrive:
        script_run_config = source_config.hyperdrive_config_func(script_run_config)  # type: ignore
    return script_run_config
Beispiel #3
0
# Build CPU image for Ray
ray_cpu_env = Environment.from_dockerfile(
    name=ray_environment_name, dockerfile=ray_environment_dockerfile_path)
ray_cpu_env.register(workspace=ws)
ray_cpu_build_details = ray_cpu_env.build(workspace=ws)

while ray_cpu_build_details.status not in ["Succeeded", "Failed"]:
    print(
        f"Awaiting completion of ray CPU environment build. Current status is: {ray_cpu_build_details.status}"
    )
    time.sleep(10)

command = ["python distribute_automl.py"]
env = Environment.get(workspace=ws, name=ray_environment_name)
compute_target = ws.compute_targets["cpucluster"]
aml_run_config = RunConfiguration(communicator="OpenMpi")
aml_run_config.target = compute_target
aml_run_config.docker = DockerConfiguration(use_docker=True)
aml_run_config.environment = env
aml_run_config.node_count = 2
config = ScriptRunConfig(
    source_directory="ray/",
    command=command,
    run_config=aml_run_config,
)

exp = Experiment(ws, "distribute-automl")
run = exp.submit(config)
print(run.get_portal_url())  # link to ml.azure.com
run.wait_for_completion(show_output=True)
Beispiel #4
0
    except ComputeTargetException:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6', min_nodes=0, max_nodes=6)
        compute_target = ComputeTarget.create(ws, compute_name, compute_config)
        compute_target.wait_for_completion(show_output=True)

    run_conf.target = compute_target
    run_conf.environment.docker.enabled = True
    run_conf.environment.docker.base_image = DEFAULT_CPU_IMAGE
    run_conf.environment.python.conda_dependencies = \
        CondaDependencies(conda_dependencies_file_path='env.yml')
    run_conf.environment.python.user_managed_dependencies = False
    if cv:
        run_conf.communicator = 'OpenMPI'
        run_conf.mpi = MpiConfiguration()
        run_conf.node_count = cv + 2
exp = Experiment(workspace=ws, name=config['experiment_name'])

use_estimator = True
if use_estimator:
    if cv:
        script_params = {'--cv': cv}
        node_count = cv + 2  # dask-mpi uses 2 nodes for its scheduler and client
        distributed_training = MpiConfiguration()
    else:
        script_params = None
        node_count = None
        distributed_training = None
    to_run = Estimator(source_directory='.',
                       compute_target=compute_target,
                       entry_script='train.py',
myenv.docker.base_image = "mcr.microsoft.com/azureml/base-gpu:openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04"
# comment out this environment variable if you don't have it set!
myenv.environment_variables = {'WANDB_API_KEY': os.environ['WANDB_API_KEY']}

## Environment: python section
conda_dep = CondaDependencies(conda_dependencies_file_path='environment.yml')
myenv.python.conda_dependencies = conda_dep

# create configuration for Run
# Use RunConfiguration to specify compute target / env deps part of run
run_config = RunConfiguration()

# Attach compute target to run config
run_config.framework = 'python'
run_config.target = "gpu-cluster"
# This doesn't actuallly do anything since my target is a persistent compute instead of amlcompute
run_config.amlcompute.vm_size = "Standard_NC24"
run_config.node_count = 1
run_config.environment = myenv

# ScriptRunConfig packaages together environment configuration of
# RunConfiguration with a script for training to create a **script** run.
""" RunConfiguration(script=None, arguments=None, framework=None, communicator=None,
 conda_dependencies=None, _history_enabled=None, _path=None, _name=None)
"""
src = ScriptRunConfig(source_directory='.',
                      script='rl_credit/examples/distractor_delay_expt.py',
                      run_config=run_config)

get_run_config_from_script_run(src).save(path='.azureml/train.runconfig')