Example 1
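Two related excerpts follow: a unit test showing that the experiment name falls back to the build branch unless an explicit experiment_name is set, and the method that submits a training job to AzureML. Imports (e.g. AzureConfig, create_experiment_name) are omitted in this excerpt.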
def test_experiment_name() -> None:
    c = AzureConfig()
    c.build_branch = "branch"
    c.get_git_information()
    assert create_experiment_name(c) == "branch"
    c.experiment_name = "foo"
    assert create_experiment_name(c) == "foo"
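
# For context, a minimal sketch (an assumption, not the real InnerEye implementation) of the behaviour
# the test above relies on: an explicitly set experiment_name wins, otherwise the build branch is used.
def create_experiment_name_sketch(azure_config: AzureConfig) -> str:
    return azure_config.experiment_name or azure_config.build_branch


# The method below is excerpted from the runner class that drives AzureML submission; it relies on
# self.azure_config, self.model_config and self.lightning_container being initialised elsewhere.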
    def submit_to_azureml_if_needed(self) -> AzureRunInfo:
        """
        Submit a job to AzureML, returning the resulting AzureRunInfo object, or exiting if we were asked to wait
        for completion and the run did not succeed.
        """
        if self.azure_config.azureml and isinstance(self.model_config, DeepLearningConfig) \
                and not self.lightning_container.azure_dataset_id:
            raise ValueError(
                "When running an InnerEye built-in model in AzureML, the 'azure_dataset_id' "
                "property must be set.")
        # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
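        # Setting CUBLAS_WORKSPACE_CONFIG=":4096:8" makes cuBLAS choose deterministic algorithms, which is
        # needed for reproducible results on CUDA 10.2 and later; it is only set when the container asks for
        # deterministic training (pl_deterministic).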
        env_variables = {
            "CUBLAS_WORKSPACE_CONFIG": ":4096:8"
        } if self.lightning_container.pl_deterministic else {}
        source_config = SourceConfig(
            root_folder=self.project_root,
            entry_script=Path(sys.argv[0]).resolve(),
            script_params=sys.argv[1:],
            conda_dependencies_files=get_all_environment_files(
                self.project_root),
            hyperdrive_config_func=(
                self.model_config.get_hyperdrive_config if self.model_config
                else self.lightning_container.get_hyperdrive_config),
            # For large jobs, the upload of results can time out because of large checkpoint files. The default
            # timeout is 600 seconds, so use 24 hours (86400 seconds) instead.
            upload_timeout_seconds=86400,
            environment_variables=env_variables)
        # Reduce the size of the snapshot by adding unused folders to amlignore. The Test* subfolders are only needed
        # when running pytest.
        ignored_folders = []
        if not self.azure_config.pytest_mark:
            ignored_folders.extend(["Tests", "TestsOutsidePackage"])
        if not self.lightning_container.regression_test_folder:
            ignored_folders.append("RegressionTestResults")

        all_local_datasets = self.lightning_container.all_local_dataset_paths()
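        # Build one dataset config per Azure dataset id, pairing it with its optional mount point and local
        # fallback path; these configs are handed to submit_to_azure_if_needed further down so that the
        # datasets can be made available (mounted or downloaded) when the job runs in AzureML.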
        input_datasets = \
            create_dataset_configs(self.azure_config,
                                   all_azure_dataset_ids=self.lightning_container.all_azure_dataset_ids(),
                                   all_dataset_mountpoints=self.lightning_container.all_dataset_mountpoints(),
                                   all_local_datasets=all_local_datasets)  # type: ignore

        def after_submission_hook(azure_run: Run) -> None:
            """
            A function that will be called right after job submission.
            """
            # Set the default display name to what was provided as the "tag". This will affect single runs
            # and Hyperdrive parent runs
            if self.azure_config.tag:
                azure_run.display_name = self.azure_config.tag
            # Add an extra tag that depends on the run that was actually submitted. This is used later for
            # filtering runs in the cross validation analysis.
            recovery_id = create_run_recovery_id(azure_run)
            azure_run.tag(RUN_RECOVERY_ID_KEY_NAME, recovery_id)
            print(
                "If this run fails, re-start runner.py and supply these additional arguments: "
                f"--run_recovery_id={recovery_id}")
            if self.azure_config.tensorboard:
                print(
                    "Starting TensorBoard now because you specified --tensorboard"
                )
                monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[azure_run.id]),
                        azure_config=self.azure_config)
            else:
                print(
                    f"To monitor this run locally using TensorBoard, run the script: "
                    f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}"
                )

            if self.azure_config.wait_for_completion:
                # We want the job output to be visible on the console. Do not exit yet if the job fails, because we
                # may need to download the pytest result file.
                azure_run.wait_for_completion(show_output=True,
                                              raise_on_error=False)
                if self.azure_config.pytest_mark:
                    # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
                    # A build step will pick up that file and publish it to Azure DevOps.
                    # If pytest_mark is set, this file must exist.
                    logging.info("Downloading pytest result file.")
                    download_pytest_result(azure_run)
                if azure_run.status == RunStatus.FAILED:
                    raise ValueError(
                        f"The AzureML run failed. Please check this URL for details: "
                        f"{azure_run.get_portal_url()}")

        hyperdrive_config = None
        if self.azure_config.hyperdrive:
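            # The ScriptRunConfig with an empty source_directory is only a placeholder for building the
            # HyperDriveConfig; the actual run configuration is presumably assembled inside
            # submit_to_azure_if_needed at submission time.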
            hyperdrive_config = self.lightning_container.get_hyperdrive_config(
                ScriptRunConfig(source_directory=""))

        # Create a temporary file for the merged Conda environment; it will be removed after the job has been
        # submitted.
        temp_conda: Optional[Path] = None
        try:
            if len(source_config.conda_dependencies_files) > 1:
                temp_conda = source_config.root_folder / f"temp_environment-{uuid.uuid4().hex[:8]}.yml"
                # Merge the project-specific dependencies with the packages that InnerEye itself needs. This should not
                # be necessary if the innereye package is installed. It is necessary when working with an outer project
                # and InnerEye as a git submodule and submitting jobs from the local machine.
                # In case of version conflicts, the package version in the outer project is given priority.
                merge_conda_files(source_config.conda_dependencies_files,
                                  temp_conda)

            # Calls like `self.azure_config.get_workspace()` will fail if no AzureML credentials are set up, and so
            # we should only attempt them if we actually intend to submit this run to AzureML.
            if self.azure_config.azureml:
                if not self.azure_config.cluster:
                    raise ValueError(
                        "self.azure_config.cluster not set, but we need a compute_cluster_name to submit"
                        "the script to run in AzureML")
                azure_run_info = submit_to_azure_if_needed(
                    entry_script=source_config.entry_script,
                    snapshot_root_directory=source_config.root_folder,
                    script_params=source_config.script_params,
                    conda_environment_file=temp_conda
                    or source_config.conda_dependencies_files[0],
                    aml_workspace=self.azure_config.get_workspace(),
                    compute_cluster_name=self.azure_config.cluster,
                    environment_variables=source_config.environment_variables,
                    default_datastore=self.azure_config.azureml_datastore,
                    experiment_name=to_azure_friendly_string(
                        create_experiment_name(self.azure_config)),
                    max_run_duration=self.azure_config.max_run_duration,
                    input_datasets=input_datasets,
                    num_nodes=self.azure_config.num_nodes,
                    wait_for_completion=False,
                    ignored_folders=ignored_folders,
                    pip_extra_index_url=self.azure_config.pip_extra_index_url,
                    submit_to_azureml=self.azure_config.azureml,
                    docker_base_image=DEFAULT_DOCKER_BASE_IMAGE,
                    docker_shm_size=self.azure_config.docker_shm_size,
                    tags=additional_run_tags(azure_config=self.azure_config,
                                             commandline_args=" ".join(
                                                 source_config.script_params)),
                    after_submission=after_submission_hook,
                    hyperdrive_config=hyperdrive_config)
                if self.azure_config.tag and azure_run_info.run:
                    if self.lightning_container.perform_cross_validation:
                        # This code is only reached inside Azure. Set the display name again - this will now affect
                        # Hyperdrive child runs (for other jobs, this has already been done after submission).
                        cv_index = self.lightning_container.cross_validation_split_index
                        full_display_name = f"{self.azure_config.tag} {cv_index}"
                        azure_run_info.run.display_name = full_display_name
            else:
                azure_run_info = submit_to_azure_if_needed(
                    input_datasets=input_datasets, submit_to_azureml=False)
        finally:
            if temp_conda:
                temp_conda.unlink()
        # submit_to_azure_if_needed calls sys.exit after submitting to AzureML. We only reach this point when
        # running locally (no submission) or when the script is already executing inside AzureML.
        return azure_run_info