def test_experiment_name() -> None:
    c = AzureConfig()
    c.build_branch = "branch"
    c.get_git_information()
    assert create_experiment_name(c) == "branch"
    c.experiment_name = "foo"
    assert create_experiment_name(c) == "foo"
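
# The test above pins down the expected precedence: an explicitly set experiment_name wins,
# otherwise the experiment is named after the git branch filled in by get_git_information().
# A minimal sketch of that behaviour (hypothetical - the real create_experiment_name lives in
# the InnerEye Azure utilities and may handle more cases):
def create_experiment_name_sketch(azure_config: AzureConfig) -> str:
    # Prefer the experiment name that the user supplied explicitly.
    if azure_config.experiment_name:
        return azure_config.experiment_name
    # Otherwise fall back to the current build branch.
    return azure_config.build_branch
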
def submit_to_azureml_if_needed(self) -> AzureRunInfo:
    """
    Submit a job to AzureML, returning the resulting AzureRunInfo object, or exiting if we were asked
    to wait for completion and the Run did not succeed.
    """
    if self.azure_config.azureml and isinstance(self.model_config, DeepLearningConfig) \
            and not self.lightning_container.azure_dataset_id:
        raise ValueError("When running an InnerEye built-in model in AzureML, the 'azure_dataset_id' "
                         "property must be set.")
    # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
    env_variables = {"CUBLAS_WORKSPACE_CONFIG": ":4096:8"} if self.lightning_container.pl_deterministic else {}
    source_config = SourceConfig(
        root_folder=self.project_root,
        entry_script=Path(sys.argv[0]).resolve(),
        script_params=sys.argv[1:],
        conda_dependencies_files=get_all_environment_files(self.project_root),
        hyperdrive_config_func=(self.model_config.get_hyperdrive_config if self.model_config
                                else self.lightning_container.get_hyperdrive_config),
        # For large jobs, upload of results can time out because of large checkpoint files. Default is 600
        upload_timeout_seconds=86400,
        environment_variables=env_variables)
    # Reduce the size of the snapshot by adding unused folders to amlignore. The Test* subfolders are only needed
    # when running pytest.
    ignored_folders = []
    if not self.azure_config.pytest_mark:
        ignored_folders.extend(["Tests", "TestsOutsidePackage"])
    if not self.lightning_container.regression_test_folder:
        ignored_folders.append("RegressionTestResults")

    all_local_datasets = self.lightning_container.all_local_dataset_paths()
    input_datasets = create_dataset_configs(
        self.azure_config,
        all_azure_dataset_ids=self.lightning_container.all_azure_dataset_ids(),
        all_dataset_mountpoints=self.lightning_container.all_dataset_mountpoints(),
        all_local_datasets=all_local_datasets)  # type: ignore

    def after_submission_hook(azure_run: Run) -> None:
        """
        A function that will be called right after job submission.
        """
        # Set the default display name to what was provided as the "tag". This will affect single runs
        # and Hyperdrive parent runs
        if self.azure_config.tag:
            azure_run.display_name = self.azure_config.tag
        # Add an extra tag that depends on the run that was actually submitted. This is used for later filtering
        # runs in cross validation analysis
        recovery_id = create_run_recovery_id(azure_run)
        azure_run.tag(RUN_RECOVERY_ID_KEY_NAME, recovery_id)
        print("If this run fails, re-start runner.py and supply these additional arguments: "
              f"--run_recovery_id={recovery_id}")
        if self.azure_config.tensorboard:
            print("Starting TensorBoard now because you specified --tensorboard")
            monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[azure_run.id]),
                    azure_config=self.azure_config)
        else:
            print(f"To monitor this run locally using TensorBoard, run the script: "
                  f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}")
        if self.azure_config.wait_for_completion:
            # We want the job output to be visible on the console. Do not exit yet if the job fails, because we
            # may need to download the pytest result file.
            azure_run.wait_for_completion(show_output=True, raise_on_error=False)
            if self.azure_config.pytest_mark:
                # The AzureML job can optionally run pytest. Attempt to download the result file to the current
                # directory. A build step will pick up that file and publish it to Azure DevOps.
                # If pytest_mark is set, this file must exist.
                logging.info("Downloading pytest result file.")
                download_pytest_result(azure_run)
            if azure_run.status == RunStatus.FAILED:
                raise ValueError(f"The AzureML run failed. Please check this URL for details: "
                                 f"{azure_run.get_portal_url()}")

    hyperdrive_config = None
    if self.azure_config.hyperdrive:
        hyperdrive_config = self.lightning_container.get_hyperdrive_config(ScriptRunConfig(source_directory=""))

    # Create a temporary file for the merged conda file, that will be removed after submission of the job.
    temp_conda: Optional[Path] = None
    try:
        if len(source_config.conda_dependencies_files) > 1:
            temp_conda = source_config.root_folder / f"temp_environment-{uuid.uuid4().hex[:8]}.yml"
            # Merge the project-specific dependencies with the packages that InnerEye itself needs. This should not
            # be necessary if the innereye package is installed. It is necessary when working with an outer project
            # and InnerEye as a git submodule and submitting jobs from the local machine.
            # In case of version conflicts, the package version in the outer project is given priority.
            merge_conda_files(source_config.conda_dependencies_files, temp_conda)

        # Calls like `self.azure_config.get_workspace()` will fail if we have no AzureML credentials set up, and so
        # we should only attempt them if we intend to elevate this to AzureML
        if self.azure_config.azureml:
            if not self.azure_config.cluster:
                raise ValueError("self.azure_config.cluster not set, but we need a compute_cluster_name to submit "
                                 "the script to run in AzureML")
            azure_run_info = submit_to_azure_if_needed(
                entry_script=source_config.entry_script,
                snapshot_root_directory=source_config.root_folder,
                script_params=source_config.script_params,
                conda_environment_file=temp_conda or source_config.conda_dependencies_files[0],
                aml_workspace=self.azure_config.get_workspace(),
                compute_cluster_name=self.azure_config.cluster,
                environment_variables=source_config.environment_variables,
                default_datastore=self.azure_config.azureml_datastore,
                experiment_name=to_azure_friendly_string(create_experiment_name(self.azure_config)),
                max_run_duration=self.azure_config.max_run_duration,
                input_datasets=input_datasets,
                num_nodes=self.azure_config.num_nodes,
                wait_for_completion=False,
                ignored_folders=ignored_folders,
                pip_extra_index_url=self.azure_config.pip_extra_index_url,
                submit_to_azureml=self.azure_config.azureml,
                docker_base_image=DEFAULT_DOCKER_BASE_IMAGE,
                docker_shm_size=self.azure_config.docker_shm_size,
                tags=additional_run_tags(azure_config=self.azure_config,
                                         commandline_args=" ".join(source_config.script_params)),
                after_submission=after_submission_hook,
                hyperdrive_config=hyperdrive_config)
            if self.azure_config.tag and azure_run_info.run:
                if self.lightning_container.perform_cross_validation:
                    # This code is only reached inside Azure. Set display name again - this will now affect
                    # Hyperdrive child runs (for other jobs, this has already been done after submission)
                    cv_index = self.lightning_container.cross_validation_split_index
                    full_display_name = f"{self.azure_config.tag} {cv_index}"
                    azure_run_info.run.display_name = full_display_name
        else:
            azure_run_info = submit_to_azure_if_needed(
                input_datasets=input_datasets,
                submit_to_azureml=False)
    finally:
        if temp_conda:
            temp_conda.unlink()
    # submit_to_azure_if_needed calls sys.exit after submitting to AzureML. We only reach this when running
    # the script locally or in AzureML.
    return azure_run_info