# The allow_partial_ground_truth argument is supplied by pytest via parametrization over both values.
@pytest.mark.parametrize("allow_partial_ground_truth", [True, False])
def test_innereyecontainer_setup_passes_on_allow_incomplete_labels(test_output_dirs: OutputFolderForTests,
                                                                   allow_partial_ground_truth: bool) -> None:
    """
    Test that InnerEyeContainer.setup passes on the correct value of allow_incomplete_labels to
    full_image_dataset.convert_channels_to_file_paths
    :param test_output_dirs: Test fixture.
    :param allow_partial_ground_truth: The value to set allow_incomplete_labels to and check it is passed through.
    """
    config = DummyModel()
    config.set_output_to(test_output_dirs.root_dir)
    config.allow_incomplete_labels = allow_partial_ground_truth
    container = InnerEyeContainer(config)

    def mocked_convert_channels_to_file_paths(_: List[str],
                                              __: pd.DataFrame,
                                              ___: Path,
                                              ____: str,
                                              allow_incomplete_labels: bool) -> Tuple[List[Optional[Path]], str]:
        paths: List[Optional[Path]] = []
        failed_channel_info = ''
        assert allow_incomplete_labels == allow_partial_ground_truth
        return paths, failed_channel_info

    with mock.patch("InnerEye.ML.lightning_base.convert_channels_to_file_paths") as convert_channels_to_file_paths_mock:
        convert_channels_to_file_paths_mock.side_effect = mocked_convert_channels_to_file_paths
        container.setup()
        convert_channels_to_file_paths_mock.assert_called()
def parse_and_load_model(self) -> ParserResult:
    """
    Parses the command line arguments, and creates configuration objects for the model itself, and for the
    Azure-related parameters. Sets self.azure_config and self.model_config to their proper values. Returns the
    parser output from parsing the model commandline arguments.
    If no "model" argument is provided on the commandline, a ValueError is raised.
    """
    # Create a parser that will understand only the args we need for an AzureConfig
    parser1 = create_runner_parser()
    parser_result = parse_args_and_add_yaml_variables(parser1,
                                                      yaml_config_file=self.yaml_config_file,
                                                      project_root=self.project_root,
                                                      fail_on_unknown_args=False)
    azure_config = AzureConfig(**parser_result.args)
    azure_config.project_root = self.project_root
    self.azure_config = azure_config
    self.model_config = None
    if not azure_config.model:
        raise ValueError("Parameter 'model' needs to be set to tell InnerEye which model to run.")
    model_config_loader: ModelConfigLoader = ModelConfigLoader(**parser_result.args)
    # Create the model as per the "model" commandline option. This can return either a built-in config
    # of type DeepLearningConfig, or a LightningContainer.
    config_or_container = model_config_loader.create_model_config_from_name(model_name=azure_config.model)

    def parse_overrides_and_apply(c: object, previous_parser_result: ParserResult) -> ParserResult:
        assert isinstance(c, GenericConfig)
        parser = type(c).create_argparser()
        # For each parser, feed in the unknown settings from the previous parser. All commandline args should
        # be consumed by name, hence fail if there is something that is still unknown.
        parser_result = parse_arguments(parser,
                                        settings_from_yaml=previous_parser_result.unknown_settings_from_yaml,
                                        args=previous_parser_result.unknown,
                                        fail_on_unknown_args=True)
        # Apply the overrides and validate. Overrides can come from either YAML settings or the commandline.
        c.apply_overrides(parser_result.known_settings_from_yaml)
        c.apply_overrides(parser_result.overrides)
        c.validate()
        return parser_result

    # Now create a parser that understands overrides at model/container level.
    parser_result = parse_overrides_and_apply(config_or_container, parser_result)
    if isinstance(config_or_container, LightningContainer):
        self.lightning_container = config_or_container
    elif isinstance(config_or_container, ModelConfigBase):
        # Built-in InnerEye models use a fake container
        self.model_config = config_or_container
        self.lightning_container = InnerEyeContainer(config_or_container)
    else:
        raise ValueError(f"Don't know how to handle a loaded configuration of type {type(config_or_container)}")
    if azure_config.extra_code_directory:
        exist = "exists" if Path(azure_config.extra_code_directory).exists() else "does not exist"
        logging.info(f"extra_code_directory is {azure_config.extra_code_directory}, which {exist}")
    else:
        logging.info("extra_code_directory is unset")
    return parser_result
def get_default_checkpoint_handler(model_config: DeepLearningConfig, project_root: Path) -> CheckpointHandler:
    """
    Gets a checkpoint handler, using the given model config and the default azure configuration.
    """
    azure_config = get_default_azure_config()
    lightning_container = InnerEyeContainer(model_config)
    return CheckpointHandler(azure_config=azure_config,
                             container=lightning_container,
                             project_root=project_root)
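# Usage sketch (not part of the original source): how a test might call the helper above.
# It relies on the DummyModel config and the OutputFolderForTests fixture used by the surrounding
# tests, and on the imports of the surrounding module; the final assertion assumes that
# CheckpointHandler keeps the container it was given as a `.container` attribute.
def example_get_checkpoint_handler(test_output_dirs: OutputFolderForTests) -> None:
    config = DummyModel()
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    # If the handler stores its container (an assumption), the model name should round-trip.
    assert checkpoint_handler.container.model_name == config.model_name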
def test_model_name_for_innereye_container() -> None:
    """
    Test if the InnerEye container picks up the name of the model correctly. The name will impact the output folder
    structure that is created.
    """
    expected_name = "DummyModel"
    model = DummyModel()
    assert model.model_name == expected_name
    container = InnerEyeContainer(model)
    assert container.model_name == expected_name
def test_copied_properties() -> None:
    """
    Test that fields defined in WorkflowParams and DatasetParams are copied from the config to the container.
    """
    config = ModelConfigBase(should_validate=False)
    # This field lives in DatasetParams
    config.azure_dataset_id = "foo"
    # This field lives in WorkflowParams
    config.number_of_cross_validation_splits = 5
    assert config.perform_cross_validation
    container = InnerEyeContainer(config)
    assert container.azure_dataset_id == "foo"
    assert container.perform_cross_validation
def test_innereye_container_init() -> None:
    """
    Test if the constructor of the InnerEye container copies attributes as expected.
    """
    # The constructor should copy all fields that belong to either WorkflowParams or DatasetParams from the
    # config object to the container.
    for (attrib, type_) in [("weights_url", WorkflowParams), ("extra_dataset_mountpoints", DatasetParams)]:
        config = ModelConfigBase(should_validate=False)
        assert hasattr(type_, attrib)
        assert hasattr(config, attrib)
        setattr(config, attrib, ["foo"])
        container = InnerEyeContainer(config)
        assert getattr(container, attrib) == ["foo"]
def test_file_system_with_subfolders(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if a subfolder can be created within the output folder structure, for use with cross validation.
    """
    model = DummyModel()
    model.set_output_to(test_output_dirs.root_dir)
    container = InnerEyeContainer(model)
    # File system should be copied from model config to container
    assert container.file_system_config == model.file_system_config
    runner = MLRunner(model_config=model)
    runner.setup()
    assert str(runner.container.outputs_folder).endswith(model.model_name)
    output_subfolder = "foo"
    expected_folder = runner.container.outputs_folder / output_subfolder
    runner = MLRunner(model_config=model, output_subfolder=output_subfolder)
    runner.setup()
    assert runner.container.outputs_folder == expected_folder
def create_model_and_store_checkpoint(config: ModelConfigBase, checkpoint_path: Path,
                                      weights_only: bool = True) -> None:
    """
    Creates a Lightning model for the given model configuration, and stores it as a checkpoint file.
    If a GPU is available, the model is moved to the GPU before storing.
    The trainer properties `current_epoch` and `global_step` are set to fixed non-default values.
    :param config: The model configuration.
    :param checkpoint_path: The path and filename of the checkpoint file.
    :param weights_only: If True, store only the model weights in the checkpoint, not the full training state.
    """
    container = InnerEyeContainer(config)
    trainer, _ = create_lightning_trainer(container)
    model = create_lightning_model(config)
    if machine_has_gpu:
        model = model.cuda()  # type: ignore
    trainer.model = model
    # Before saving, the values for epoch and step are incremented. Save them here in such a way that we can assert
    # easily later. We can't mock that because otherwise the mock object would be written to disk (that fails)
    trainer.fit_loop.current_epoch = FIXED_EPOCH - 1  # type: ignore
    trainer.fit_loop.global_step = FIXED_GLOBAL_STEP - 1  # type: ignore
    # In PL, it is the Trainer's responsibility to save the model. Checkpoint handling refers back to the trainer
    # to get a save_func. Mimicking that here.
    trainer.save_checkpoint(checkpoint_path, weights_only=weights_only)
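# Usage sketch (not part of the original source): store a checkpoint for a DummyModel into the test
# output folder. It assumes the OutputFolderForTests fixture used by the other tests in this section
# and the imports of the surrounding module; the checkpoint file name is arbitrary.
def example_store_checkpoint(test_output_dirs: OutputFolderForTests) -> None:
    config = DummyModel()
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_path = test_output_dirs.root_dir / "checkpoint.ckpt"
    # weights_only=False keeps the full training state, including the fixed epoch and step values
    # that the helper sets before saving.
    create_model_and_store_checkpoint(config, checkpoint_path, weights_only=False)
    assert checkpoint_path.is_file()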
class Runner:
    """
    This class contains the high-level logic to start a training run: choose a model configuration by name,
    submit to AzureML if needed, or otherwise start the actual training and test loop.
    :param project_root: The root folder that contains all of the source code that should be executed.
    :param yaml_config_file: The path to the YAML file that contains values to supply into sys.argv.
    :param post_cross_validation_hook: A function to call after waiting for completion of cross validation runs.
        The function is called with the model configuration and the path to the downloaded and merged metrics files.
    :param model_deployment_hook: an optional function for deploying a model in an application-specific way.
        If present, it should take a model config (SegmentationModelBase), an AzureConfig, and an AzureML
        Model as arguments, and return an optional Path and a further object of any type.
    """

    def __init__(self,
                 project_root: Path,
                 yaml_config_file: Path,
                 post_cross_validation_hook: Optional[PostCrossValidationHookSignature] = None,
                 model_deployment_hook: Optional[ModelDeploymentHookSignature] = None):
        self.project_root = project_root
        self.yaml_config_file = yaml_config_file
        self.post_cross_validation_hook = post_cross_validation_hook
        self.model_deployment_hook = model_deployment_hook
        # model_config and azure_config are placeholders for now, and are set properly when command line args are
        # parsed.
        self.model_config: Optional[DeepLearningConfig] = None
        self.azure_config: AzureConfig = AzureConfig()
        self.lightning_container: LightningContainer = None  # type: ignore
        # This field stores the MLRunner object that has been created in the most recent call to the run() method.
        self.ml_runner: Optional[MLRunner] = None

    def parse_and_load_model(self) -> ParserResult:
        """
        Parses the command line arguments, and creates configuration objects for the model itself, and for the
        Azure-related parameters. Sets self.azure_config and self.model_config to their proper values. Returns the
        parser output from parsing the model commandline arguments.
        If no "model" argument is provided on the commandline, a ValueError is raised.
        """
        # Create a parser that will understand only the args we need for an AzureConfig
        parser1 = create_runner_parser()
        parser_result = parse_args_and_add_yaml_variables(parser1,
                                                          yaml_config_file=self.yaml_config_file,
                                                          project_root=self.project_root,
                                                          fail_on_unknown_args=False)
        azure_config = AzureConfig(**parser_result.args)
        azure_config.project_root = self.project_root
        self.azure_config = azure_config
        self.model_config = None
        if not azure_config.model:
            raise ValueError("Parameter 'model' needs to be set to tell InnerEye which model to run.")
        model_config_loader: ModelConfigLoader = ModelConfigLoader(**parser_result.args)
        # Create the model as per the "model" commandline option. This can return either a built-in config
        # of type DeepLearningConfig, or a LightningContainer.
        config_or_container = model_config_loader.create_model_config_from_name(model_name=azure_config.model)

        def parse_overrides_and_apply(c: object, previous_parser_result: ParserResult) -> ParserResult:
            assert isinstance(c, GenericConfig)
            parser = type(c).create_argparser()
            # For each parser, feed in the unknown settings from the previous parser. All commandline args should
            # be consumed by name, hence fail if there is something that is still unknown.
            parser_result = parse_arguments(parser,
                                            settings_from_yaml=previous_parser_result.unknown_settings_from_yaml,
                                            args=previous_parser_result.unknown,
                                            fail_on_unknown_args=True)
            # Apply the overrides and validate. Overrides can come from either YAML settings or the commandline.
            c.apply_overrides(parser_result.known_settings_from_yaml)
            c.apply_overrides(parser_result.overrides)
            c.validate()
            return parser_result

        # Now create a parser that understands overrides at model/container level.
        parser_result = parse_overrides_and_apply(config_or_container, parser_result)
        if isinstance(config_or_container, LightningContainer):
            self.lightning_container = config_or_container
        elif isinstance(config_or_container, ModelConfigBase):
            # Built-in InnerEye models use a fake container
            self.model_config = config_or_container
            self.lightning_container = InnerEyeContainer(config_or_container)
        else:
            raise ValueError(f"Don't know how to handle a loaded configuration of type {type(config_or_container)}")
        # Allow overriding AzureConfig params from within the container.
        self.lightning_container.update_azure_config(self.azure_config)
        if azure_config.extra_code_directory:
            exist = "exists" if Path(azure_config.extra_code_directory).exists() else "does not exist"
            logging.info(f"extra_code_directory is {azure_config.extra_code_directory}, which {exist}")
        else:
            logging.info("extra_code_directory is unset")
        return parser_result

    def run(self) -> Tuple[Optional[DeepLearningConfig], AzureRunInfo]:
        """
        The main entry point for training and testing models from the commandline. This chooses a model to train
        via a commandline argument, runs training or testing, and writes all required info to disk and logs.
        :return: If submitting to AzureML, returns the model configuration that was used for training,
            including commandline overrides applied (if any).
        """
        # Usually, when we set logging to DEBUG, we want diagnostics about the model
        # build itself, but not the tons of debug information that AzureML submissions create.
        logging_to_stdout(logging.INFO if is_local_rank_zero() else "ERROR")
        initialize_rpdb()
        user_agent.append(azure_util.INNEREYE_SDK_NAME, azure_util.INNEREYE_SDK_VERSION)
        self.parse_and_load_model()
        if self.lightning_container.perform_cross_validation:
            # force hyperdrive usage if performing cross validation
            self.azure_config.hyperdrive = True
        azure_run_info = self.submit_to_azureml_if_needed()
        self.run_in_situ(azure_run_info)
        if self.model_config is None:
            return self.lightning_container, azure_run_info
        return self.model_config, azure_run_info

    def submit_to_azureml_if_needed(self) -> AzureRunInfo:
        """
        Submit a job to AzureML, returning the resulting Run object, or exiting if we were asked to wait for
        completion and the Run did not succeed.
        """
        if self.azure_config.azureml and isinstance(self.model_config, DeepLearningConfig) \
                and not self.lightning_container.azure_dataset_id:
            raise ValueError("When running an InnerEye built-in model in AzureML, the 'azure_dataset_id' "
                             "property must be set.")
        # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
        env_variables = {"CUBLAS_WORKSPACE_CONFIG": ":4096:8"} if self.lightning_container.pl_deterministic else {}
        source_config = SourceConfig(
            root_folder=self.project_root,
            entry_script=Path(sys.argv[0]).resolve(),
            script_params=sys.argv[1:],
            conda_dependencies_files=get_all_environment_files(self.project_root),
            hyperdrive_config_func=(self.model_config.get_hyperdrive_config if self.model_config
                                    else self.lightning_container.get_hyperdrive_config),
            # For large jobs, upload of results can time out because of large checkpoint files. Default is 600
            upload_timeout_seconds=86400,
            environment_variables=env_variables)
        # Reduce the size of the snapshot by adding unused folders to amlignore. The Test* subfolders are only needed
        # when running pytest.
        ignored_folders = []
        if not self.azure_config.pytest_mark:
            ignored_folders.extend(["Tests", "TestsOutsidePackage"])
        if not self.lightning_container.regression_test_folder:
            ignored_folders.append("RegressionTestResults")
        all_local_datasets = self.lightning_container.all_local_dataset_paths()
        input_datasets = \
            create_dataset_configs(self.azure_config,
                                   all_azure_dataset_ids=self.lightning_container.all_azure_dataset_ids(),
                                   all_dataset_mountpoints=self.lightning_container.all_dataset_mountpoints(),
                                   all_local_datasets=all_local_datasets)  # type: ignore

        def after_submission_hook(azure_run: Run) -> None:
            """
            A function that will be called right after job submission.
            """
            # Set the default display name to what was provided as the "tag". This will affect single runs
            # and Hyperdrive parent runs
            if self.azure_config.tag:
                azure_run.display_name = self.azure_config.tag
            # Add an extra tag that depends on the run that was actually submitted. This is used for later filtering
            # runs in cross validation analysis
            recovery_id = create_run_recovery_id(azure_run)
            azure_run.tag(RUN_RECOVERY_ID_KEY_NAME, recovery_id)
            print("If this run fails, re-start runner.py and supply these additional arguments: "
                  f"--run_recovery_id={recovery_id}")
            if self.azure_config.tensorboard:
                print("Starting TensorBoard now because you specified --tensorboard")
                monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[azure_run.id]),
                        azure_config=self.azure_config)
            else:
                print(f"To monitor this run locally using TensorBoard, run the script: "
                      f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}")
            if self.azure_config.wait_for_completion:
                # We want the job output to be visible on the console. Do not exit yet if the job fails, because we
                # may need to download the pytest result file.
                azure_run.wait_for_completion(show_output=True, raise_on_error=False)
                if self.azure_config.pytest_mark:
                    # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
                    # A build step will pick up that file and publish it to Azure DevOps.
                    # If pytest_mark is set, this file must exist.
                    logging.info("Downloading pytest result file.")
                    download_pytest_result(azure_run)
                if azure_run.status == RunStatus.FAILED:
                    raise ValueError(f"The AzureML run failed. Please check this URL for details: "
                                     f"{azure_run.get_portal_url()}")

        hyperdrive_config = None
        if self.azure_config.hyperdrive:
            hyperdrive_config = self.lightning_container.get_hyperdrive_config(ScriptRunConfig(source_directory=""))
        # Create a temporary file for the merged conda file, that will be removed after submission of the job.
        temp_conda: Optional[Path] = None
        try:
            if len(source_config.conda_dependencies_files) > 1:
                temp_conda = source_config.root_folder / f"temp_environment-{uuid.uuid4().hex[:8]}.yml"
                # Merge the project-specific dependencies with the packages that InnerEye itself needs. This should
                # not be necessary if the innereye package is installed. It is necessary when working with an outer
                # project and InnerEye as a git submodule and submitting jobs from the local machine.
                # In case of version conflicts, the package version in the outer project is given priority.
                merge_conda_files(source_config.conda_dependencies_files, temp_conda)
            # Calls like `self.azure_config.get_workspace()` will fail if we have no AzureML credentials set up, and
            # so we should only attempt them if we intend to elevate this to AzureML
            if self.azure_config.azureml:
                if not self.azure_config.cluster:
                    raise ValueError("self.azure_config.cluster not set, but we need a compute_cluster_name to "
                                     "submit the script to run in AzureML")
                azure_run_info = submit_to_azure_if_needed(
                    entry_script=source_config.entry_script,
                    snapshot_root_directory=source_config.root_folder,
                    script_params=source_config.script_params,
                    conda_environment_file=temp_conda or source_config.conda_dependencies_files[0],
                    aml_workspace=self.azure_config.get_workspace(),
                    compute_cluster_name=self.azure_config.cluster,
                    environment_variables=source_config.environment_variables,
                    default_datastore=self.azure_config.azureml_datastore,
                    experiment_name=to_azure_friendly_string(create_experiment_name(self.azure_config)),
                    max_run_duration=self.azure_config.max_run_duration,
                    input_datasets=input_datasets,
                    num_nodes=self.azure_config.num_nodes,
                    wait_for_completion=False,
                    ignored_folders=ignored_folders,
                    pip_extra_index_url=self.azure_config.pip_extra_index_url,
                    submit_to_azureml=self.azure_config.azureml,
                    docker_base_image=DEFAULT_DOCKER_BASE_IMAGE,
                    docker_shm_size=self.azure_config.docker_shm_size,
                    tags=additional_run_tags(azure_config=self.azure_config,
                                             commandline_args=" ".join(source_config.script_params)),
                    after_submission=after_submission_hook,
                    hyperdrive_config=hyperdrive_config)
                if self.azure_config.tag and azure_run_info.run:
                    if self.lightning_container.perform_cross_validation:
                        # This code is only reached inside Azure. Set display name again - this will now affect
                        # Hyperdrive child runs (for other jobs, this has already been done after submission)
                        cv_index = self.lightning_container.cross_validation_split_index
                        full_display_name = f"{self.azure_config.tag} {cv_index}"
                        azure_run_info.run.display_name = full_display_name
            else:
                azure_run_info = submit_to_azure_if_needed(
                    input_datasets=input_datasets,
                    submit_to_azureml=False)
        finally:
            if temp_conda:
                temp_conda.unlink()
        # submit_to_azure_if_needed calls sys.exit after submitting to AzureML. We only reach this when running
        # the script locally or in AzureML.
        return azure_run_info

    def print_git_tags(self) -> None:
        """
        When running in AzureML, print all the tags that contain information about the git repository status,
        for answering the question "which code version was used" from a log file only.
        """
        git_tags = get_git_tags(self.azure_config)
        if is_offline_run_context(RUN_CONTEXT):
            # When running on a VM outside AzureML, we can read git information from the current repository
            tags_to_print = git_tags
        else:
            # When running in AzureML, the git repo information is not necessarily passed in, but we copy the git
            # information into run tags after submitting the job, and can read it out here.
            # Only print out those tags that were created from git-related information
            tags_to_print = {key: value for key, value in RUN_CONTEXT.get_tags().items() if key in git_tags}
        logging.info("Git repository information:")
        for key, value in tags_to_print.items():
            logging.info(f"  {key:20}: {value}")

    def run_in_situ(self, azure_run_info: AzureRunInfo) -> None:
        """
        Actually run the AzureML job; this method will typically run on an Azure VM.
        :param azure_run_info: Contains all information about the present run in AzureML, in particular where the
            datasets are mounted.
        """
        # Only set the logging level now. Usually, when we set logging to DEBUG, we want diagnostics about the model
        # build itself, but not the tons of debug information that AzureML submissions create.
        # Suppress the logging from all processes but the one for GPU 0 on each node, to make log files more readable
        logging_to_stdout(self.azure_config.log_level if is_local_rank_zero() else "ERROR")
        package_setup_and_hacks()
        if is_global_rank_zero():
            self.print_git_tags()
        # For the PR build in AzureML, we can either run pytest, or train the simple PR model. Running both
        # only works when using DDP_spawn, but that has the side effect of messing up memory consumption of the
        # large models.
        if self.azure_config.pytest_mark:
            outputs_folder = Path.cwd() / fixed_paths.DEFAULT_AML_UPLOAD_DIR
            pytest_passed, results_file_path = run_pytest(self.azure_config.pytest_mark, outputs_folder)
            if not pytest_passed:
                # Terminate if pytest has failed. This makes the smoke test in
                # PR builds fail if pytest fails.
                pytest_failures = f"Not all PyTest tests passed. See {results_file_path}"
                raise ValueError(pytest_failures)
        else:
            # Set environment variables for multi-node training if needed. This function will terminate early
            # if it detects that it is not in a multi-node environment.
            set_environment_variables_for_multi_node()
            self.ml_runner = self.create_ml_runner()
            self.ml_runner.setup(azure_run_info)
            self.ml_runner.run()

    def create_ml_runner(self) -> MLRunner:
        """
        Create and return an ML runner using the attributes of this Runner object.
        """
        return MLRunner(
            model_config=self.model_config,
            container=self.lightning_container,
            azure_config=self.azure_config,
            project_root=self.project_root,
            post_cross_validation_hook=self.post_cross_validation_hook,
            model_deployment_hook=self.model_deployment_hook)
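# Usage sketch (not part of the original source): drive a training run programmatically with the Runner
# class above. The paths and the function name are illustrative placeholders; a real caller would point
# project_root at the repository root and yaml_config_file at the project settings file, and would pass
# the model choice via sys.argv (e.g. "--model=DummyModel").
def example_run_runner() -> None:
    runner = Runner(project_root=Path.cwd(),
                    yaml_config_file=Path.cwd() / "InnerEye/settings.yml",
                    post_cross_validation_hook=None,
                    model_deployment_hook=None)
    # run() parses sys.argv, optionally submits to AzureML, and otherwise trains locally.
    config_or_container, azure_run_info = runner.run()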
def test_convert_channels_to_file_paths(default_config: ModelConfigBase) -> None:
    """
    Test unit for missing channels and missing files.
    """
    # Sets DummyModel config and container
    container = InnerEyeContainer(default_config)

    # 1 Should not return any errors given that no channels or files are missing
    container.setup()

    # 2 Creates an InnerEyeContainer object with missing channels "channel1", "channel2" and 'mask'
    # for patients 1 and 4, and a wrong file name for channel "mask" of patient 2. An error report is generated
    # for patients 1, 2 and 4. Patient 3 should not generate any error, as no channel or file is missing.
    # Commented lines show the missing rows and hence the expected error reporting.
    data_frame = [
        # ["1", "train_and_test_data/id1_channel1.nii.gz", "channel1", "1"],
        # ["1", "train_and_test_data/id1_channel1.nii.gz", "channel2", "1"],
        # ["1", "train_and_test_data/id1_mask.nii.gz", "mask", "1"],
        ["1", "train_and_test_data/id1_region.nii.gz", "region", "1"],
        ["1", "train_and_test_data/id1_region.nii.gz", "region_1", "1"],
        ["2", "train_and_test_data/id2_channel1.nii.gz", "channel1", "2"],
        ["2", "train_and_test_data/id2_channel1.nii.gz", "channel2", "2"],
        ["2", "FILE_A", "mask", "2"],
        ["2", "train_and_test_data/id2_region.nii.gz", "region", "2"],
        ["2", "train_and_test_data/id2_region.nii.gz", "region_1", "2"],
        ["3", "train_and_test_data/id2_channel1.nii.gz", "channel1", "3"],
        ["3", "train_and_test_data/id2_channel1.nii.gz", "channel2", "3"],
        ["3", "train_and_test_data/id2_mask.nii.gz", "mask", "3"],
        ["3", "train_and_test_data/id2_region.nii.gz", "region", "3"],
        ["3", "train_and_test_data/id2_region.nii.gz", "region_1", "3"],
        # ["4", "train_and_test_data/id2_channel1.nii.gz", "channel1", "4"],
        # ["4", "train_and_test_data/id2_channel1.nii.gz", "channel2", "4"],
        # ["4", "train_and_test_data/id2_mask.nii.gz", "mask", "4"],
        ["4", "train_and_test_data/id2_region.nii.gz", "region", "4"],
        ["4", "train_and_test_data/id2_region.nii.gz", "region_1", "4"]
    ]

    # 3 Overwrite the get_model_train_test_dataset_splits method for subjects 1, 2, 3, 4
    class MyDummyModel(DummyModel):
        def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
            return DatasetSplits(train=dataset_df[dataset_df.subject.isin(['1'])],
                                 test=dataset_df[dataset_df.subject.isin(['2', '3'])],
                                 val=dataset_df[dataset_df.subject.isin(['4'])])

    config_missing_channels_and_files = MyDummyModel()
    data_frame_with_missing_channels_and_files = pd.DataFrame(
        data_frame, columns=['subject', 'filePath', 'channel', 'institutionId'])
    config_missing_channels_and_files._dataset_data_frame = data_frame_with_missing_channels_and_files
    container_missing_files_channels = InnerEyeContainer(config_missing_channels_and_files)
    with pytest.raises(ValueError) as e:
        container_missing_files_channels.setup()
    assert "Patient 1 does not have channel 'channel1'" in str(e.value)
    assert "Patient 1 does not have channel 'channel2'" in str(e.value)
    assert "Patient 1 does not have channel 'mask'" in str(e.value)
    assert "Patient 2" in str(e.value) and "FILE_A does not exist" in str(e.value)
    assert "Patient 3" not in str(e.value)
    assert "Patient 4 does not have channel 'channel1'" in str(e.value)
    assert "Patient 4 does not have channel 'channel2'" in str(e.value)
    assert "Patient 4 does not have channel 'mask'" in str(e.value)
class Runner:
    """
    This class contains the high-level logic to start a training run: choose a model configuration by name,
    submit to AzureML if needed, or otherwise start the actual training and test loop.
    :param project_root: The root folder that contains all of the source code that should be executed.
    :param yaml_config_file: The path to the YAML file that contains values to supply into sys.argv.
    :param post_cross_validation_hook: A function to call after waiting for completion of cross validation runs.
        The function is called with the model configuration and the path to the downloaded and merged metrics files.
    :param model_deployment_hook: an optional function for deploying a model in an application-specific way.
        If present, it should take a model config (SegmentationModelBase), an AzureConfig, and an AzureML
        Model as arguments, and return an optional Path and a further object of any type.
    """

    def __init__(self,
                 project_root: Path,
                 yaml_config_file: Path,
                 post_cross_validation_hook: Optional[PostCrossValidationHookSignature] = None,
                 model_deployment_hook: Optional[ModelDeploymentHookSignature] = None):
        self.project_root = project_root
        self.yaml_config_file = yaml_config_file
        self.post_cross_validation_hook = post_cross_validation_hook
        self.model_deployment_hook = model_deployment_hook
        # model_config and azure_config are placeholders for now, and are set properly when command line args are
        # parsed.
        self.model_config: Optional[DeepLearningConfig] = None
        self.azure_config: AzureConfig = AzureConfig()
        self.lightning_container: LightningContainer = None  # type: ignore

    def parse_and_load_model(self) -> ParserResult:
        """
        Parses the command line arguments, and creates configuration objects for the model itself, and for the
        Azure-related parameters. Sets self.azure_config and self.model_config to their proper values. Returns the
        parser output from parsing the model commandline arguments.
        If no "model" argument is provided on the commandline, a ValueError is raised.
        """
        # Create a parser that will understand only the args we need for an AzureConfig
        parser1 = create_runner_parser()
        parser_result = parse_args_and_add_yaml_variables(parser1,
                                                          yaml_config_file=self.yaml_config_file,
                                                          project_root=self.project_root,
                                                          fail_on_unknown_args=False)
        azure_config = AzureConfig(**parser_result.args)
        azure_config.project_root = self.project_root
        self.azure_config = azure_config
        self.model_config = None
        if not azure_config.model:
            raise ValueError("Parameter 'model' needs to be set to tell InnerEye which model to run.")
        model_config_loader: ModelConfigLoader = ModelConfigLoader(**parser_result.args)
        # Create the model as per the "model" commandline option. This can return either a built-in config
        # of type DeepLearningConfig, or a LightningContainer.
        config_or_container = model_config_loader.create_model_config_from_name(model_name=azure_config.model)

        def parse_overrides_and_apply(c: object, previous_parser_result: ParserResult) -> ParserResult:
            assert isinstance(c, GenericConfig)
            parser = type(c).create_argparser()
            # For each parser, feed in the unknown settings from the previous parser. All commandline args should
            # be consumed by name, hence fail if there is something that is still unknown.
            parser_result = parse_arguments(parser,
                                            settings_from_yaml=previous_parser_result.unknown_settings_from_yaml,
                                            args=previous_parser_result.unknown,
                                            fail_on_unknown_args=True)
            # Apply the overrides and validate. Overrides can come from either YAML settings or the commandline.
            c.apply_overrides(parser_result.known_settings_from_yaml)
            c.apply_overrides(parser_result.overrides)
            c.validate()
            return parser_result

        # Now create a parser that understands overrides at model/container level.
        parser_result = parse_overrides_and_apply(config_or_container, parser_result)
        if isinstance(config_or_container, LightningContainer):
            self.lightning_container = config_or_container
        elif isinstance(config_or_container, ModelConfigBase):
            # Built-in InnerEye models use a fake container
            self.model_config = config_or_container
            self.lightning_container = InnerEyeContainer(config_or_container)
        else:
            raise ValueError(f"Don't know how to handle a loaded configuration of type {type(config_or_container)}")
        if azure_config.extra_code_directory:
            exist = "exists" if Path(azure_config.extra_code_directory).exists() else "does not exist"
            logging.info(f"extra_code_directory is {azure_config.extra_code_directory}, which {exist}")
        else:
            logging.info("extra_code_directory is unset")
        return parser_result

    def run(self) -> Tuple[Optional[DeepLearningConfig], Optional[Run]]:
        """
        The main entry point for training and testing models from the commandline. This chooses a model to train
        via a commandline argument, runs training or testing, and writes all required info to disk and logs.
        :return: If submitting to AzureML, returns the model configuration that was used for training,
            including commandline overrides applied (if any).
        """
        # Usually, when we set logging to DEBUG, we want diagnostics about the model
        # build itself, but not the tons of debug information that AzureML submissions create.
        logging_to_stdout(logging.INFO if is_local_rank_zero() else "ERROR")
        initialize_rpdb()
        user_agent.append(azure_util.INNEREYE_SDK_NAME, azure_util.INNEREYE_SDK_VERSION)
        self.parse_and_load_model()
        if self.lightning_container.perform_cross_validation:
            if self.model_config is None:
                raise NotImplementedError("Cross validation for LightningContainer models is not yet supported.")
            # force hyperdrive usage if performing cross validation
            self.azure_config.hyperdrive = True
        run_object: Optional[Run] = None
        if self.azure_config.azureml:
            run_object = self.submit_to_azureml()
        else:
            self.run_in_situ()
        if self.model_config is None:
            return self.lightning_container, run_object
        return self.model_config, run_object

    def submit_to_azureml(self) -> Run:
        """
        Submit a job to AzureML, returning the resulting Run object, or exiting if we were asked to wait for
        completion and the Run did not succeed.
        """
        # The adal package creates a logging.info line each time it gets an authentication token, avoid that.
        logging.getLogger('adal-python').setLevel(logging.WARNING)
        # Azure core prints full HTTP requests even in INFO mode
        logging.getLogger('azure').setLevel(logging.WARNING)
        # PyJWT prints out warnings that are beyond our control
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        if isinstance(self.model_config, DeepLearningConfig) and not self.lightning_container.azure_dataset_id:
            raise ValueError("When running an InnerEye built-in model in AzureML, the 'azure_dataset_id' "
                             "property must be set.")
        hyperdrive_func = lambda run_config: self.model_config.get_hyperdrive_config(run_config)  # type: ignore
        source_config = SourceConfig(
            root_folder=self.project_root,
            entry_script=Path(sys.argv[0]).resolve(),
            conda_dependencies_files=get_all_environment_files(self.project_root),
            hyperdrive_config_func=hyperdrive_func,
            # For large jobs, upload of results can time out because of large checkpoint files. Default is 600
            upload_timeout_seconds=86400,
        )
        source_config.set_script_params_except_submit_flag()
        azure_run = submit_to_azureml(self.azure_config, source_config,
                                      self.lightning_container.all_azure_dataset_ids(),
                                      self.lightning_container.all_dataset_mountpoints())
        logging.info("Job submission to AzureML done.")
        if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
            # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
            # A build step will pick up that file and publish it to Azure DevOps.
            # If pytest_mark is set, this file must exist.
            logging.info("Downloading pytest result file.")
            download_pytest_result(azure_run)
        else:
            logging.info("No pytest_mark present, hence not downloading the pytest result file.")
        # For PR builds where we wait for job completion, the job must have ended in a COMPLETED state.
        if self.azure_config.wait_for_completion and not is_run_and_child_runs_completed(azure_run):
            raise ValueError(f"Run {azure_run.id} in experiment {azure_run.experiment.name} or one of its child "
                             "runs failed.")
        return azure_run

    def print_git_tags(self) -> None:
        """
        When running in AzureML, print all the tags that contain information about the git repository status,
        for answering the question "which code version was used" from a log file only.
        """
        git_tags = get_git_tags(self.azure_config)
        if is_offline_run_context(RUN_CONTEXT):
            # When running on a VM outside AzureML, we can read git information from the current repository
            tags_to_print = git_tags
        else:
            # When running in AzureML, the git repo information is not necessarily passed in, but we copy the git
            # information into run tags after submitting the job, and can read it out here.
            # Only print out those tags that were created from git-related information
            tags_to_print = {key: value for key, value in RUN_CONTEXT.get_tags().items() if key in git_tags}
        logging.info("Git repository information:")
        for key, value in tags_to_print.items():
            logging.info(f"  {key:20}: {value}")

    def run_in_situ(self) -> None:
        """
        Actually run the AzureML job; this method will typically run on an Azure VM.
        """
        # Only set the logging level now. Usually, when we set logging to DEBUG, we want diagnostics about the model
        # build itself, but not the tons of debug information that AzureML submissions create.
        # Suppress the logging from all processes but the one for GPU 0 on each node, to make log files more readable
        logging_to_stdout(self.azure_config.log_level if is_local_rank_zero() else "ERROR")
        suppress_logging_noise()
        if is_global_rank_zero():
            self.print_git_tags()
        # For the PR build in AzureML, we can either run pytest, or train the simple PR model. Running both
        # only works when using DDP_spawn, but that has the side effect of messing up memory consumption of the
        # large models.
        if self.azure_config.pytest_mark:
            outputs_folder = Path.cwd() / fixed_paths.DEFAULT_AML_UPLOAD_DIR
            pytest_passed, results_file_path = run_pytest(self.azure_config.pytest_mark, outputs_folder)
            if not pytest_passed:
                # Terminate if pytest has failed. This makes the smoke test in
                # PR builds fail if pytest fails.
                pytest_failures = f"Not all PyTest tests passed. See {results_file_path}"
                raise ValueError(pytest_failures)
        else:
            # Set environment variables for multi-node training if needed.
            # In particular, the multi-node environment variables should NOT be set in single node
            # training, otherwise this might lead to errors with the c10 distributed backend
            # (https://github.com/microsoft/InnerEye-DeepLearning/issues/395)
            if self.azure_config.num_nodes > 1:
                set_environment_variables_for_multi_node()
            ml_runner = self.create_ml_runner()
            ml_runner.setup()
            ml_runner.start_logging_to_file()
            try:
                ml_runner.run()
            finally:
                disable_logging_to_file()

    def create_ml_runner(self) -> MLRunner:
        """
        Create and return an ML runner using the attributes of this Runner object.
        """
        return MLRunner(
            model_config=self.model_config,
            container=self.lightning_container,
            azure_config=self.azure_config,
            project_root=self.project_root,
            post_cross_validation_hook=self.post_cross_validation_hook,
            model_deployment_hook=self.model_deployment_hook)
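# Usage sketch (not part of the original source): a minimal command-line entry point for this older
# Runner variant. The function name `main` and the settings file path are illustrative placeholders;
# the actual repository wires this up in its own runner script, and the model to run is selected via
# the "--model" commandline argument that parse_and_load_model() consumes.
def main() -> None:
    runner = Runner(project_root=Path.cwd(),
                    yaml_config_file=Path.cwd() / "InnerEye/settings.yml")
    runner.run()


if __name__ == '__main__':
    main()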