def create_run_config(azure_config: AzureConfig,
                      source_config: SourceConfig,
                      all_azure_dataset_ids: List[str],
                      all_dataset_mountpoints: List[str],
                      environment_name: str = "") -> ScriptRunConfig:
    """
    Builds the AzureML script run configuration for the InnerEye training script.

    :param azure_config: Azure-related settings. The fields read here are the compute cluster, the number of
        nodes, the maximum run duration, and the hyperdrive flag.
    :param source_config: Describes the code to execute: root folder, entry script, script arguments, and the
        function that turns a script run into a Hyperdrive configuration.
    :param all_azure_dataset_ids: The names of all datasets on blob storage that this run will use.
    :param all_dataset_mountpoints: The per-dataset mount points to use for the datasets in AzureML.
    :param environment_name: If given, an existing Python environment with this name is retrieved. If none is
        found, an environment is created from the Conda files in `source_config`. Meant for running inference
        for an existing model.
    :return: The configured script run. When `azure_config.hyperdrive` is set, this is the value returned by
        `source_config.hyperdrive_config_func`.
    """
    consumptions = create_dataset_consumptions(azure_config, all_azure_dataset_ids, all_dataset_mountpoints)

    # AzureML sometimes appears to expect the entry script in Linux format, so use a posix-style relative path.
    script_in_root = source_config.entry_script.relative_to(source_config.root_folder)
    entry_script_posix = script_in_root.as_posix()
    logging.info(f"Entry script {entry_script_posix} ({source_config.entry_script} relative to "
                 f"source directory {source_config.root_folder})")

    # The duration limit stays None unless one has been configured.
    duration_limit_seconds = (run_duration_string_to_seconds(azure_config.max_run_duration)
                              if azure_config.max_run_duration else None)
    workspace = azure_config.get_workspace()

    run_config = RunConfiguration(script=entry_script_posix, arguments=source_config.script_params)
    run_config.environment = get_or_create_python_environment(azure_config,
                                                              source_config,
                                                              environment_name=environment_name)
    run_config.target = azure_config.cluster
    run_config.max_run_duration_seconds = duration_limit_seconds

    # Jobs that span more than one node are set up as MPI jobs.
    if azure_config.num_nodes > 1:
        mpi_settings = MpiConfiguration(node_count=azure_config.num_nodes)
        run_config.mpi = mpi_settings
        run_config.framework = "Python"
        run_config.communicator = "IntelMpi"
        run_config.node_count = mpi_settings.node_count

    # Only touch `run_config.data` when there is at least one dataset to attach.
    consumptions_by_name = {consumption.name: consumption for consumption in consumptions}
    if consumptions_by_name:
        run_config.data = consumptions_by_name

    # Store the source in blob storage, rather than in the FileShares section of the storage account.
    blob_datastore = workspace.datastores.get(WORKSPACE_DEFAULT_BLOB_STORE_NAME)
    run_config.source_directory_data_store = blob_datastore.name

    script_run = ScriptRunConfig(source_directory=str(source_config.root_folder), run_config=run_config)
    if not azure_config.hyperdrive:
        return script_run
    return source_config.hyperdrive_config_func(script_run)  # type: ignore
def create_run_config(azure_config: AzureConfig,
                      source_config: SourceConfig,
                      azure_dataset_id: str = "",
                      environment_name: str = "") -> ScriptRunConfig:
    """
    Builds the AzureML script run configuration for the InnerEye training script.

    :param azure_config: Azure-related settings. The fields read here are the compute cluster, the number of
        nodes, the maximum run duration, the dataset mount/download switch, and the hyperdrive flag.
    :param source_config: Describes the code to execute: root folder, entry script, script arguments, and the
        function that turns a script run into a Hyperdrive configuration.
    :param azure_dataset_id: The name of the dataset in blob storage to use for this run. An empty string
        means that no dataset is attached to the run.
    :param environment_name: If given, an existing Python environment with this name is retrieved. If none is
        found, an environment is created from the Conda files in `source_config`. Meant for running inference
        for an existing model.
    :return: The configured script run. When `azure_config.hyperdrive` is set, this is the value returned by
        `source_config.hyperdrive_config_func`.
    :raises ValueError: If a dataset ID was given, but `get_or_create_dataset` returned nothing for it.
    """
    dataset_consumption = None
    if azure_dataset_id:
        dataset = get_or_create_dataset(azure_config, azure_dataset_id=azure_dataset_id)
        if not dataset:
            raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
        named_input = dataset.as_named_input(INPUT_DATA_KEY)
        # The dataset is either mounted or downloaded, depending on the Azure configuration.
        if azure_config.use_dataset_mount:
            dataset_consumption = named_input.as_mount()
        else:
            dataset_consumption = named_input.as_download()

    # AzureML sometimes appears to expect the entry script in Linux format, so use a posix-style relative path.
    script_in_root = source_config.entry_script.relative_to(source_config.root_folder)
    entry_script_posix = script_in_root.as_posix()
    logging.info(f"Entry script {entry_script_posix} ({source_config.entry_script} relative to "
                 f"source directory {source_config.root_folder})")

    # The duration limit stays None unless one has been configured.
    duration_limit_seconds = (run_duration_string_to_seconds(azure_config.max_run_duration)
                              if azure_config.max_run_duration else None)
    workspace = azure_config.get_workspace()

    run_config = RunConfiguration(script=entry_script_posix, arguments=source_config.script_params)
    run_config.environment = get_or_create_python_environment(azure_config,
                                                              source_config,
                                                              environment_name=environment_name)
    run_config.target = azure_config.cluster
    run_config.max_run_duration_seconds = duration_limit_seconds

    # Jobs that span more than one node are set up as MPI jobs.
    if azure_config.num_nodes > 1:
        mpi_settings = MpiConfiguration(node_count=azure_config.num_nodes)
        run_config.mpi = mpi_settings
        run_config.framework = "Python"
        run_config.communicator = "IntelMpi"
        run_config.node_count = mpi_settings.node_count

    if dataset_consumption:
        run_config.data = {dataset_consumption.name: dataset_consumption}

    # Store the source in blob storage, rather than in the FileShares section of the storage account.
    blob_datastore = workspace.datastores.get(WORKSPACE_DEFAULT_BLOB_STORE_NAME)
    run_config.source_directory_data_store = blob_datastore.name

    script_run = ScriptRunConfig(source_directory=str(source_config.root_folder), run_config=run_config)
    if not azure_config.hyperdrive:
        return script_run
    return source_config.hyperdrive_config_func(script_run)  # type: ignore
# Build CPU image for Ray: create the environment from the Dockerfile, register it
# with the workspace, and start the image build.
ray_cpu_env = Environment.from_dockerfile(
    name=ray_environment_name, dockerfile=ray_environment_dockerfile_path
)
ray_cpu_env.register(workspace=ws)
ray_cpu_build_details = ray_cpu_env.build(workspace=ws)

# Poll every 10 seconds until the build reaches a terminal state. The status is read
# once per poll so that the printed value is the same one the loop condition tested.
# NOTE(review): this relies on `status` reflecting the current build state on every
# read, exactly as the original polling loop did.
ray_cpu_build_status = ray_cpu_build_details.status
while ray_cpu_build_status not in ("Succeeded", "Failed"):
    print(
        f"Awaiting completion of ray CPU environment build. Current status is: {ray_cpu_build_status}"
    )
    time.sleep(10)
    ray_cpu_build_status = ray_cpu_build_details.status

# Fail fast: a failed image build used to fall through silently, and the run was
# submitted anyway against an environment whose image had just failed to build.
if ray_cpu_build_status == "Failed":
    raise RuntimeError(
        f"Build of ray CPU environment '{ray_environment_name}' failed; the run was not submitted."
    )

command = ["python distribute_automl.py"]

# Run configuration: the registered environment, in Docker, on two nodes of the
# "cpucluster" compute target, with OpenMpi as the communicator.
env = Environment.get(workspace=ws, name=ray_environment_name)
compute_target = ws.compute_targets["cpucluster"]
aml_run_config = RunConfiguration(communicator="OpenMpi")
aml_run_config.target = compute_target
aml_run_config.docker = DockerConfiguration(use_docker=True)
aml_run_config.environment = env
aml_run_config.node_count = 2

config = ScriptRunConfig(
    source_directory="ray/",
    command=command,
    run_config=aml_run_config,
)

# Submit the run to the "distribute-automl" experiment and block until it completes.
exp = Experiment(ws, "distribute-automl")
run = exp.submit(config)
print(run.get_portal_url())  # link to ml.azure.com
run.wait_for_completion(show_output=True)
except ComputeTargetException: compute_config = AmlCompute.provisioning_configuration( vm_size='STANDARD_NC6', min_nodes=0, max_nodes=6) compute_target = ComputeTarget.create(ws, compute_name, compute_config) compute_target.wait_for_completion(show_output=True) run_conf.target = compute_target run_conf.environment.docker.enabled = True run_conf.environment.docker.base_image = DEFAULT_CPU_IMAGE run_conf.environment.python.conda_dependencies = \ CondaDependencies(conda_dependencies_file_path='env.yml') run_conf.environment.python.user_managed_dependencies = False if cv: run_conf.communicator = 'OpenMPI' run_conf.mpi = MpiConfiguration() run_conf.node_count = cv + 2 exp = Experiment(workspace=ws, name=config['experiment_name']) use_estimator = True if use_estimator: if cv: script_params = {'--cv': cv} node_count = cv + 2 # dask-mpi uses 2 nodes for its scheduler and client distributed_training = MpiConfiguration() else: script_params = None node_count = None distributed_training = None to_run = Estimator(source_directory='.', compute_target=compute_target, entry_script='train.py',
myenv.docker.base_image = "mcr.microsoft.com/azureml/base-gpu:openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04"

# Forward WANDB_API_KEY to the run environment only when it is set locally. Indexing
# os.environ directly raised KeyError for anyone without the variable, which forced
# them to edit this script by hand before they could use it.
wandb_api_key = os.environ.get('WANDB_API_KEY')
if wandb_api_key is not None:
    myenv.environment_variables = {'WANDB_API_KEY': wandb_api_key}
else:
    print("WANDB_API_KEY is not set; leaving the environment variables of the run environment unchanged.")

## Environment: python section
conda_dep = CondaDependencies(conda_dependencies_file_path='environment.yml')
myenv.python.conda_dependencies = conda_dep

# create configuration for Run
# Use RunConfiguration to specify compute target / env deps part of run
run_config = RunConfiguration()

# Attach compute target to run config
run_config.framework = 'python'
run_config.target = "gpu-cluster"

# This doesn't actually do anything since my target is a persistent compute instead of amlcompute
run_config.amlcompute.vm_size = "Standard_NC24"
run_config.node_count = 1
run_config.environment = myenv

# ScriptRunConfig packages together the environment configuration of
# RunConfiguration with a script for training to create a **script** run.
# For reference, the RunConfiguration signature noted by the original author:
#   RunConfiguration(script=None, arguments=None, framework=None, communicator=None,
#                    conda_dependencies=None, _history_enabled=None, _path=None, _name=None)
src = ScriptRunConfig(source_directory='.',
                      script='rl_credit/examples/distractor_delay_expt.py',
                      run_config=run_config)

# Save the run configuration of the script run to a runconfig file.
get_run_config_from_script_run(src).save(path='.azureml/train.runconfig')