Example #1
def test_innereyecontainer_setup_passes_on_allow_incomplete_labels(
        test_output_dirs: OutputFolderForTests,
        allow_partial_ground_truth: bool) -> None:
    """
    Test that InnerEyeContainer.setup passes the correct value of allow_incomplete_labels on to
    full_image_dataset.convert_channels_to_file_paths.
    :param test_output_dirs: Test fixture.
    :param allow_partial_ground_truth: The value to set allow_incomplete_labels to; the test checks that this
    value is passed through.
    """
    config = DummyModel()
    config.set_output_to(test_output_dirs.root_dir)
    config.allow_incomplete_labels = allow_partial_ground_truth
    container = InnerEyeContainer(config)

    def mocked_convert_channels_to_file_paths(
            _: List[str], __: pd.DataFrame, ___: Path, ____: str,
            allow_incomplete_labels: bool) -> Tuple[List[Optional[Path]], str]:
        paths: List[Optional[Path]] = []
        failed_channel_info = ''
        assert allow_incomplete_labels == allow_partial_ground_truth
        return paths, failed_channel_info

    with mock.patch("InnerEye.ML.lightning_base.convert_channels_to_file_paths"
                    ) as convert_channels_to_file_paths_mock:
        convert_channels_to_file_paths_mock.side_effect = mocked_convert_channels_to_file_paths
        container.setup()
        convert_channels_to_file_paths_mock.assert_called()
Example #2
    def parse_and_load_model(self) -> ParserResult:
        """
        Parses the command line arguments, and creates configuration objects for the model itself, and for the
        Azure-related parameters. Sets self.azure_config and self.model_config to their proper values. Returns the
        parser output from parsing the model commandline arguments.
        If no "model" argument is provided on the commandline, self.model_config will be set to None, and the return
        value is None.
        """
        # Create a parser that will understand only the args we need for an AzureConfig
        parser1 = create_runner_parser()
        parser_result = parse_args_and_add_yaml_variables(parser1,
                                                          yaml_config_file=self.yaml_config_file,
                                                          project_root=self.project_root,
                                                          fail_on_unknown_args=False)
        azure_config = AzureConfig(**parser_result.args)
        azure_config.project_root = self.project_root
        self.azure_config = azure_config
        self.model_config = None
        if not azure_config.model:
            raise ValueError("Parameter 'model' needs to be set to tell InnerEye which model to run.")
        model_config_loader: ModelConfigLoader = ModelConfigLoader(**parser_result.args)
        # Create the model as per the "model" commandline option. This can return either a built-in config
        # of type DeepLearningConfig, or a LightningContainer.
        config_or_container = model_config_loader.create_model_config_from_name(model_name=azure_config.model)

        def parse_overrides_and_apply(c: object, previous_parser_result: ParserResult) -> ParserResult:
            assert isinstance(c, GenericConfig)
            parser = type(c).create_argparser()
            # For each parser, feed in the unknown settings from the previous parser. All commandline args should
            # be consumed by name, hence fail if there is something that is still unknown.
            parser_result = parse_arguments(parser,
                                            settings_from_yaml=previous_parser_result.unknown_settings_from_yaml,
                                            args=previous_parser_result.unknown,
                                            fail_on_unknown_args=True)
            # Apply the overrides and validate. Overrides can come from either YAML settings or the commandline.
            c.apply_overrides(parser_result.known_settings_from_yaml)
            c.apply_overrides(parser_result.overrides)
            c.validate()
            return parser_result

        # Now create a parser that understands overrides at model/container level.
        parser_result = parse_overrides_and_apply(config_or_container, parser_result)

        if isinstance(config_or_container, LightningContainer):
            self.lightning_container = config_or_container
        elif isinstance(config_or_container, ModelConfigBase):
            # Built-in InnerEye models use a fake container
            self.model_config = config_or_container
            self.lightning_container = InnerEyeContainer(config_or_container)
        else:
            raise ValueError(f"Don't know how to handle a loaded configuration of type {type(config_or_container)}")
        if azure_config.extra_code_directory:
            exist = "exists" if Path(azure_config.extra_code_directory).exists() else "does not exist"
            logging.info(f"extra_code_directory is {azure_config.extra_code_directory}, which {exist}")
        else:
            logging.info("extra_code_directory is unset")
        return parser_result
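A hedged sketch of driving parse_and_load_model end to end. The model name, the YAML file path and the
assertion on azure_config.model are illustrative assumptions, not values taken from the code above.

import sys
from pathlib import Path
from unittest import mock


def parse_dummy_model() -> None:
    # Pretend the script was started as "runner.py --model=DummyModel"; the model name is an assumption.
    fake_argv = ["runner.py", "--model=DummyModel"]
    with mock.patch.object(sys, "argv", fake_argv):
        runner = Runner(project_root=Path.cwd(), yaml_config_file=Path("settings.yml"))
        parser_result = runner.parse_and_load_model()
    # After parsing, both the Azure configuration and the Lightning container should be populated.
    assert runner.azure_config.model == "DummyModel"
    assert runner.lightning_container is not None
    print(parser_result.overrides)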
Example #3
def get_default_checkpoint_handler(model_config: DeepLearningConfig, project_root: Path) -> CheckpointHandler:
    """
    Gets a checkpoint handler, using the given model config and the default azure configuration.
    """
    azure_config = get_default_azure_config()
    lightning_container = InnerEyeContainer(model_config)
    return CheckpointHandler(azure_config=azure_config,
                             container=lightning_container,
                             project_root=project_root)
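A minimal usage sketch, assuming the DummyModel configuration and the OutputFolderForTests fixture used in the
other examples; the attribute access on the handler mirrors the constructor arguments above and is otherwise an
assumption.

def test_default_checkpoint_handler_wraps_container(test_output_dirs: OutputFolderForTests) -> None:
    config = DummyModel()
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    # The handler wraps the container that was built from the model config.
    assert isinstance(checkpoint_handler.container, InnerEyeContainer)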
Example #4
def test_model_name_for_innereye_container() -> None:
    """
    Test if the InnerEye container picks up the name of the model correctly. The name will impact the output folder
    structure that is created.
    """
    expected_name = "DummyModel"
    model = DummyModel()
    assert model.model_name == expected_name
    container = InnerEyeContainer(model)
    assert container.model_name == expected_name
Example #5
def test_copied_properties() -> None:
    config = ModelConfigBase(should_validate=False)
    # This field lives in DatasetParams
    config.azure_dataset_id = "foo"
    # This field lives in WorkflowParams
    config.number_of_cross_validation_splits = 5
    assert config.perform_cross_validation
    container = InnerEyeContainer(config)
    assert container.azure_dataset_id == "foo"
    assert container.perform_cross_validation
Example #6
def test_innereye_container_init() -> None:
    """
    Test if the constructor of the InnerEye container copies attributes as expected.
    """
    # The constructor should copy all fields that belong to either WorkflowParams or DatasetParams from the
    # config object to the container.
    for (attrib, type_) in [("weights_url", WorkflowParams),
                            ("extra_dataset_mountpoints", DatasetParams)]:
        config = ModelConfigBase(should_validate=False)
        assert hasattr(type_, attrib)
        assert hasattr(config, attrib)
        setattr(config, attrib, ["foo"])
        container = InnerEyeContainer(config)
        assert getattr(container, attrib) == ["foo"]
Example #7
def test_file_system_with_subfolders(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if a subfolder can be created within the output folder structure, for use with cross validation.
    """
    model = DummyModel()
    model.set_output_to(test_output_dirs.root_dir)
    container = InnerEyeContainer(model)
    # File system should be copied from model config to container
    assert container.file_system_config == model.file_system_config
    runner = MLRunner(model_config=model)
    runner.setup()
    assert str(runner.container.outputs_folder).endswith(model.model_name)
    output_subfolder = "foo"
    expected_folder = runner.container.outputs_folder / output_subfolder
    runner = MLRunner(model_config=model, output_subfolder=output_subfolder)
    runner.setup()
    assert runner.container.outputs_folder == expected_folder
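As a hedged illustration of the cross-validation use case mentioned in the docstring above, each split index
could be mapped to its own output subfolder. The helper below is illustrative and not part of the code shown
here.

def outputs_folder_for_split(model: DummyModel, split_index: int) -> Path:
    # One MLRunner per cross-validation split, each writing into its own numbered subfolder.
    runner = MLRunner(model_config=model, output_subfolder=str(split_index))
    runner.setup()
    return runner.container.outputs_folder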
def create_model_and_store_checkpoint(config: ModelConfigBase,
                                      checkpoint_path: Path,
                                      weights_only: bool = True) -> None:
    """
    Creates a Lightning model for the given model configuration, and stores it as a checkpoint file.
    If a GPU is available, the model is moved to the GPU before storing.
    The trainer properties `current_epoch` and `global_step` are set to fixed non-default values.
    :param config: The model configuration.
    :param checkpoint_path: The path and filename of the checkpoint file.
    :param weights_only: If True, only the model weights are saved; if False, the full trainer state is included.
    """
    container = InnerEyeContainer(config)
    trainer, _ = create_lightning_trainer(container)
    model = create_lightning_model(config)
    if machine_has_gpu:
        model = model.cuda()  # type: ignore
    trainer.model = model
    # The values for epoch and step are incremented before saving. Set them here in such a way that the stored
    # values can be asserted easily later. We can't mock the increment, because the mock object would then be
    # written to disk, which fails.
    trainer.fit_loop.current_epoch = FIXED_EPOCH - 1  # type: ignore
    trainer.fit_loop.global_step = FIXED_GLOBAL_STEP - 1  # type: ignore
    # In PL, it is the Trainer's responsibility to save the model. Checkpoint handling refers back to the trainer
    # to get a save_func. Mimicking that here.
    trainer.save_checkpoint(checkpoint_path, weights_only=weights_only)
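A hedged sketch of calling this helper and inspecting the stored file. The checkpoint keys queried below follow
standard PyTorch Lightning conventions and are an assumption, not derived from the code above.

import torch


def store_and_inspect_checkpoint(tmp_path: Path) -> None:
    config = DummyModel()
    checkpoint_path = tmp_path / "checkpoint.ckpt"
    # weights_only=False keeps the trainer state (epoch, global step) in the stored file.
    create_model_and_store_checkpoint(config, checkpoint_path, weights_only=False)
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    # Lightning checkpoints typically store the epoch and global step at the top level of the dictionary.
    print(checkpoint.get("epoch"), checkpoint.get("global_step"))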
class Runner:
    """
    This class contains the high-level logic to start a training run: choose a model configuration by name,
    submit to AzureML if needed, or otherwise start the actual training and test loop.
    :param project_root: The root folder that contains all of the source code that should be executed.
    :param yaml_config_file: The path to the YAML file that contains values to supply into sys.argv.
    :param post_cross_validation_hook: A function to call after waiting for completion of cross validation runs.
    The function is called with the model configuration and the path to the downloaded and merged metrics files.
    :param model_deployment_hook: an optional function for deploying a model in an application-specific way.
    If present, it should take a model config (SegmentationModelBase), an AzureConfig, and an AzureML
    Model as arguments, and return an optional Path and a further object of any type.
    """
    def __init__(self,
                 project_root: Path,
                 yaml_config_file: Path,
                 post_cross_validation_hook: Optional[
                     PostCrossValidationHookSignature] = None,
                 model_deployment_hook: Optional[
                     ModelDeploymentHookSignature] = None):
        self.project_root = project_root
        self.yaml_config_file = yaml_config_file
        self.post_cross_validation_hook = post_cross_validation_hook
        self.model_deployment_hook = model_deployment_hook
        # model_config and azure_config are placeholders for now, and are set properly when command line args are
        # parsed.
        self.model_config: Optional[DeepLearningConfig] = None
        self.azure_config: AzureConfig = AzureConfig()
        self.lightning_container: LightningContainer = None  # type: ignore
        # This field stores the MLRunner object that has been created in the most recent call to the run() method.
        self.ml_runner: Optional[MLRunner] = None

    def parse_and_load_model(self) -> ParserResult:
        """
        Parses the command line arguments, and creates configuration objects for the model itself, and for the
        Azure-related parameters. Sets self.azure_config and self.model_config to their proper values. Returns the
        parser output from parsing the model commandline arguments.
        If no "model" argument is provided on the commandline, self.model_config will be set to None, and the return
        value is None.
        """
        # Create a parser that will understand only the args we need for an AzureConfig
        parser1 = create_runner_parser()
        parser_result = parse_args_and_add_yaml_variables(
            parser1,
            yaml_config_file=self.yaml_config_file,
            project_root=self.project_root,
            fail_on_unknown_args=False)
        azure_config = AzureConfig(**parser_result.args)
        azure_config.project_root = self.project_root
        self.azure_config = azure_config
        self.model_config = None
        if not azure_config.model:
            raise ValueError(
                "Parameter 'model' needs to be set to tell InnerEye which model to run."
            )
        model_config_loader: ModelConfigLoader = ModelConfigLoader(
            **parser_result.args)
        # Create the model as per the "model" commandline option. This can return either a built-in config
        # of type DeepLearningConfig, or a LightningContainer.
        config_or_container = model_config_loader.create_model_config_from_name(
            model_name=azure_config.model)

        def parse_overrides_and_apply(
                c: object,
                previous_parser_result: ParserResult) -> ParserResult:
            assert isinstance(c, GenericConfig)
            parser = type(c).create_argparser()
            # For each parser, feed in the unknown settings from the previous parser. All commandline args should
            # be consumed by name, hence fail if there is something that is still unknown.
            parser_result = parse_arguments(
                parser,
                settings_from_yaml=previous_parser_result.
                unknown_settings_from_yaml,
                args=previous_parser_result.unknown,
                fail_on_unknown_args=True)
            # Apply the overrides and validate. Overrides can come from either YAML settings or the commandline.
            c.apply_overrides(parser_result.known_settings_from_yaml)
            c.apply_overrides(parser_result.overrides)
            c.validate()
            return parser_result

        # Now create a parser that understands overrides at model/container level.
        parser_result = parse_overrides_and_apply(config_or_container,
                                                  parser_result)

        if isinstance(config_or_container, LightningContainer):
            self.lightning_container = config_or_container
        elif isinstance(config_or_container, ModelConfigBase):
            # Built-in InnerEye models use a fake container
            self.model_config = config_or_container
            self.lightning_container = InnerEyeContainer(config_or_container)
        else:
            raise ValueError(
                f"Don't know how to handle a loaded configuration of type {type(config_or_container)}"
            )

        # Allow overriding AzureConfig params from within the container.
        self.lightning_container.update_azure_config(self.azure_config)

        if azure_config.extra_code_directory:
            exist = "exists" if Path(azure_config.extra_code_directory).exists(
            ) else "does not exist"
            logging.info(
                f"extra_code_directory is {azure_config.extra_code_directory}, which {exist}"
            )
        else:
            logging.info("extra_code_directory is unset")
        return parser_result

    def run(self) -> Tuple[Optional[DeepLearningConfig], AzureRunInfo]:
        """
        The main entry point for training and testing models from the commandline. This chooses a model to train
        via a commandline argument, runs training or testing, and writes all required info to disk and logs.
        :return: A tuple of the configuration that was used for training (including any commandline overrides
        applied) and the AzureRunInfo object for the submitted or local run.
        """
        # Usually, when we set logging to DEBUG, we want diagnostics about the model
        # build itself, but not the tons of debug information that AzureML submissions create.
        logging_to_stdout(logging.INFO if is_local_rank_zero() else "ERROR")
        initialize_rpdb()
        user_agent.append(azure_util.INNEREYE_SDK_NAME,
                          azure_util.INNEREYE_SDK_VERSION)
        self.parse_and_load_model()
        if self.lightning_container.perform_cross_validation:
            # force hyperdrive usage if performing cross validation
            self.azure_config.hyperdrive = True
        azure_run_info = self.submit_to_azureml_if_needed()
        self.run_in_situ(azure_run_info)
        if self.model_config is None:
            return self.lightning_container, azure_run_info
        return self.model_config, azure_run_info

    def submit_to_azureml_if_needed(self) -> AzureRunInfo:
        """
        Submit the job to AzureML if the configuration requests it, and return an AzureRunInfo object that
        describes either the submitted AzureML run or the local run.
        """
        if self.azure_config.azureml and isinstance(self.model_config, DeepLearningConfig) \
                and not self.lightning_container.azure_dataset_id:
            raise ValueError(
                "When running an InnerEye built-in model in AzureML, the 'azure_dataset_id' "
                "property must be set.")
        # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
        env_variables = {
            "CUBLAS_WORKSPACE_CONFIG": ":4096:8"
        } if self.lightning_container.pl_deterministic else {}
        source_config = SourceConfig(
            root_folder=self.project_root,
            entry_script=Path(sys.argv[0]).resolve(),
            script_params=sys.argv[1:],
            conda_dependencies_files=get_all_environment_files(
                self.project_root),
            hyperdrive_config_func=(
                self.model_config.get_hyperdrive_config if self.model_config
                else self.lightning_container.get_hyperdrive_config),
            # For large jobs, upload of results can time out because of large checkpoint files. Default is 600
            upload_timeout_seconds=86400,
            environment_variables=env_variables)
        # Reduce the size of the snapshot by adding unused folders to amlignore. The Test* subfolders are only needed
        # when running pytest.
        ignored_folders = []
        if not self.azure_config.pytest_mark:
            ignored_folders.extend(["Tests", "TestsOutsidePackage"])
        if not self.lightning_container.regression_test_folder:
            ignored_folders.append("RegressionTestResults")

        all_local_datasets = self.lightning_container.all_local_dataset_paths()
        input_datasets = \
            create_dataset_configs(self.azure_config,
                                   all_azure_dataset_ids=self.lightning_container.all_azure_dataset_ids(),
                                   all_dataset_mountpoints=self.lightning_container.all_dataset_mountpoints(),
                                   all_local_datasets=all_local_datasets)  # type: ignore

        def after_submission_hook(azure_run: Run) -> None:
            """
            A function that will be called right after job submission.
            """
            # Set the default display name to what was provided as the "tag". This will affect single runs
            # and Hyperdrive parent runs
            if self.azure_config.tag:
                azure_run.display_name = self.azure_config.tag
            # Add an extra tag that depends on the run that was actually submitted. This is used later to filter
            # runs in the cross validation analysis.
            recovery_id = create_run_recovery_id(azure_run)
            azure_run.tag(RUN_RECOVERY_ID_KEY_NAME, recovery_id)
            print(
                "If this run fails, re-start runner.py and supply these additional arguments: "
                f"--run_recovery_id={recovery_id}")
            if self.azure_config.tensorboard:
                print(
                    "Starting TensorBoard now because you specified --tensorboard"
                )
                monitor(monitor_config=AMLTensorBoardMonitorConfig(
                    run_ids=[azure_run.id]),
                        azure_config=self.azure_config)
            else:
                print(
                    f"To monitor this run locally using TensorBoard, run the script: "
                    f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}"
                )

            if self.azure_config.wait_for_completion:
                # We want the job output to be visible on the console. Do not exit yet if the job fails, because we
                # may need to download the pytest result file.
                azure_run.wait_for_completion(show_output=True,
                                              raise_on_error=False)
                if self.azure_config.pytest_mark:
                    # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
                    # A build step will pick up that file and publish it to Azure DevOps.
                    # If pytest_mark is set, this file must exist.
                    logging.info("Downloading pytest result file.")
                    download_pytest_result(azure_run)
                if azure_run.status == RunStatus.FAILED:
                    raise ValueError(
                        f"The AzureML run failed. Please check this URL for details: "
                        f"{azure_run.get_portal_url()}")

        hyperdrive_config = None
        if self.azure_config.hyperdrive:
            hyperdrive_config = self.lightning_container.get_hyperdrive_config(
                ScriptRunConfig(source_directory=""))

        # Create a temporary file for the merged conda file, that will be removed after submission of the job.
        temp_conda: Optional[Path] = None
        try:
            if len(source_config.conda_dependencies_files) > 1:
                temp_conda = source_config.root_folder / f"temp_environment-{uuid.uuid4().hex[:8]}.yml"
                # Merge the project-specific dependencies with the packages that InnerEye itself needs. This should not
                # be necessary if the innereye package is installed. It is necessary when working with an outer project
                # and InnerEye as a git submodule and submitting jobs from the local machine.
                # In case of version conflicts, the package version in the outer project is given priority.
                merge_conda_files(source_config.conda_dependencies_files,
                                  temp_conda)

            # Calls like `self.azure_config.get_workspace()` will fail if we have no AzureML credentials set up, and so
            # we should only attempt them if we intend to submit this run to AzureML
            if self.azure_config.azureml:
                if not self.azure_config.cluster:
                    raise ValueError(
                        "self.azure_config.cluster not set, but we need a compute_cluster_name to submit"
                        "the script to run in AzureML")
                azure_run_info = submit_to_azure_if_needed(
                    entry_script=source_config.entry_script,
                    snapshot_root_directory=source_config.root_folder,
                    script_params=source_config.script_params,
                    conda_environment_file=temp_conda
                    or source_config.conda_dependencies_files[0],
                    aml_workspace=self.azure_config.get_workspace(),
                    compute_cluster_name=self.azure_config.cluster,
                    environment_variables=source_config.environment_variables,
                    default_datastore=self.azure_config.azureml_datastore,
                    experiment_name=to_azure_friendly_string(
                        create_experiment_name(self.azure_config)),
                    max_run_duration=self.azure_config.max_run_duration,
                    input_datasets=input_datasets,
                    num_nodes=self.azure_config.num_nodes,
                    wait_for_completion=False,
                    ignored_folders=ignored_folders,
                    pip_extra_index_url=self.azure_config.pip_extra_index_url,
                    submit_to_azureml=self.azure_config.azureml,
                    docker_base_image=DEFAULT_DOCKER_BASE_IMAGE,
                    docker_shm_size=self.azure_config.docker_shm_size,
                    tags=additional_run_tags(azure_config=self.azure_config,
                                             commandline_args=" ".join(
                                                 source_config.script_params)),
                    after_submission=after_submission_hook,
                    hyperdrive_config=hyperdrive_config)
                if self.azure_config.tag and azure_run_info.run:
                    if self.lightning_container.perform_cross_validation:
                        # This code is only reached inside Azure. Set display name again - this will now affect
                        # Hyperdrive child runs (for other jobs, this has already been done after submission)
                        cv_index = self.lightning_container.cross_validation_split_index
                        full_display_name = f"{self.azure_config.tag} {cv_index}"
                        azure_run_info.run.display_name = full_display_name
            else:
                azure_run_info = submit_to_azure_if_needed(
                    input_datasets=input_datasets, submit_to_azureml=False)
        finally:
            if temp_conda:
                temp_conda.unlink()
        # submit_to_azure_if_needed calls sys.exit after submitting to AzureML, so this point is only reached
        # when the script runs locally or already inside AzureML.
        return azure_run_info

    def print_git_tags(self) -> None:
        """
        When running in AzureML, print all the tags that contain information about the git repository status,
        for answering the question "which code version was used" from a log file only.
        """
        git_tags = get_git_tags(self.azure_config)
        if is_offline_run_context(RUN_CONTEXT):
            # When running on a VM outside AzureML, we can read git information from the current repository
            tags_to_print = git_tags
        else:
            # When running in AzureML, the git repo information is not necessarily passed in, but we copy the git
            # information into run tags after submitting the job, and can read it out here.
            # Only print out those tags that were created from git-related information
            tags_to_print = {
                key: value
                for key, value in RUN_CONTEXT.get_tags().items()
                if key in git_tags
            }
        logging.info("Git repository information:")
        for key, value in tags_to_print.items():
            logging.info(f"    {key:20}: {value}")

    def run_in_situ(self, azure_run_info: AzureRunInfo) -> None:
        """
        Actually run the AzureML job; this method will typically run on an Azure VM.
        :param azure_run_info: Contains all information about the present run in AzureML, in particular where the
        datasets are mounted.
        """
        # Only set the logging level now. Usually, when we set logging to DEBUG, we want diagnostics about the model
        # build itself, but not the tons of debug information that AzureML submissions create.
        # Suppress the logging from all processes but the one for GPU 0 on each node, to make log files more readable
        logging_to_stdout(
            self.azure_config.log_level if is_local_rank_zero() else "ERROR")
        package_setup_and_hacks()
        if is_global_rank_zero():
            self.print_git_tags()
        # For the PR build in AzureML, we can either run pytest, or train the simple PR model. Running both
        # only works when using DDP_spawn, but that has the side effect of messing up memory consumption of the
        # large models.
        if self.azure_config.pytest_mark:
            outputs_folder = Path.cwd() / fixed_paths.DEFAULT_AML_UPLOAD_DIR
            pytest_passed, results_file_path = run_pytest(
                self.azure_config.pytest_mark, outputs_folder)
            if not pytest_passed:
                # Terminate if pytest has failed. This makes the smoke test in
                # PR builds fail if pytest fails.
                pytest_failures = f"Not all PyTest tests passed. See {results_file_path}"
                raise ValueError(pytest_failures)
        else:
            # Set environment variables for multi-node training if needed. This function will terminate early
            # if it detects that it is not in a multi-node environment.
            set_environment_variables_for_multi_node()
            self.ml_runner = self.create_ml_runner()
            self.ml_runner.setup(azure_run_info)
            self.ml_runner.run()

    def create_ml_runner(self) -> MLRunner:
        """
        Create and return an ML runner using the attributes of this Runner object.
        """
        return MLRunner(
            model_config=self.model_config,
            container=self.lightning_container,
            azure_config=self.azure_config,
            project_root=self.project_root,
            post_cross_validation_hook=self.post_cross_validation_hook,
            model_deployment_hook=self.model_deployment_hook)
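A minimal sketch of constructing this Runner with the optional hooks described in its docstring. The paths, the
hook body and the untyped hook signature used here are illustrative assumptions.

import logging
from pathlib import Path


def example_post_cross_validation_hook(config, metrics_path: Path) -> None:
    # Called with the model configuration and the path to the downloaded and merged metrics files.
    logging.info(f"Cross validation finished; merged metrics are in {metrics_path}")


def main() -> None:
    runner = Runner(project_root=Path.cwd(),
                    yaml_config_file=Path("settings.yml"),
                    post_cross_validation_hook=example_post_cross_validation_hook)
    runner.run()


if __name__ == "__main__":
    main()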
Example #10
def test_convert_channels_to_file_paths(
        default_config: ModelConfigBase) -> None:
    """
    Unit test for missing channels and missing files.
    """
    # Sets DummyModel config and container
    container = InnerEyeContainer(default_config)

    # 1 Should not return any errors given that no channels or files are missing
    container.setup()

    # 2 Create an InnerEyeContainer where the channels "channel1", "channel2" and "mask" are missing
    # for patients 1 and 4, and the file name for channel "mask" of patient 2 is wrong. An error report is
    # generated for patients 1, 2 and 4. Patient 3 should not generate any error, as no channel or file is missing.

    # The commented-out rows below are the ones that are deliberately missing and hence expected in the error report.
    data_frame = [
        # ["1",  "train_and_test_data/id1_channel1.nii.gz", "channel1",  "1"],
        # ["1",  "train_and_test_data/id1_channel1.nii.gz", "channel2",  "1"],
        # ["1",  "train_and_test_data/id1_mask.nii.gz",     "mask",      "1"],
        ["1", "train_and_test_data/id1_region.nii.gz", "region", "1"],
        ["1", "train_and_test_data/id1_region.nii.gz", "region_1", "1"],
        ["2", "train_and_test_data/id2_channel1.nii.gz", "channel1", "2"],
        ["2", "train_and_test_data/id2_channel1.nii.gz", "channel2", "2"],
        ["2", "FILE_A", "mask", "2"],
        ["2", "train_and_test_data/id2_region.nii.gz", "region", "2"],
        ["2", "train_and_test_data/id2_region.nii.gz", "region_1", "2"],
        ["3", "train_and_test_data/id2_channel1.nii.gz", "channel1", "3"],
        ["3", "train_and_test_data/id2_channel1.nii.gz", "channel2", "3"],
        ["3", "train_and_test_data/id2_mask.nii.gz", "mask", "3"],
        ["3", "train_and_test_data/id2_region.nii.gz", "region", "3"],
        ["3", "train_and_test_data/id2_region.nii.gz", "region_1", "3"],
        # ["4",  "train_and_test_data/id2_channel1.nii.gz", "channel1",  "4"],
        # ["4",  "train_and_test_data/id2_channel1.nii.gz", "channel2",  "4"],
        # ["4",  "train_and_test_data/id2_mask.nii.gz",     "mask",      "4"],
        ["4", "train_and_test_data/id2_region.nii.gz", "region", "4"],
        ["4", "train_and_test_data/id2_region.nii.gz", "region_1", "4"]
    ]

    # 3 Override the get_model_train_test_dataset_splits method to assign subjects 1, 2, 3, 4 to fixed splits
    class MyDummyModel(DummyModel):
        def get_model_train_test_dataset_splits(
                self, dataset_df: pd.DataFrame) -> DatasetSplits:
            return DatasetSplits(
                train=dataset_df[dataset_df.subject.isin(['1'])],
                test=dataset_df[dataset_df.subject.isin(['2', '3'])],
                val=dataset_df[dataset_df.subject.isin(['4'])])

    config_missing_channels_and_files = MyDummyModel()
    data_frame_with_missing_channels_and_files = pd.DataFrame(
        data_frame,
        columns=['subject', 'filePath', 'channel', 'institutionId'])
    config_missing_channels_and_files._dataset_data_frame = data_frame_with_missing_channels_and_files
    container_missing_files_channels = InnerEyeContainer(
        config_missing_channels_and_files)
    with pytest.raises(ValueError) as e:
        container_missing_files_channels.setup()

    assert "Patient 1 does not have channel 'channel1'" in str(e.value)
    assert "Patient 1 does not have channel 'channel2'" in str(e.value)
    assert "Patient 1 does not have channel 'mask'" in str(e.value)
    assert "Patient 2" in str(e.value) and "FILE_A does not exist" in str(
        e.value)
    assert "Patient 3" not in str(e.value)
    assert "Patient 4 does not have channel 'channel1'" in str(e.value)
    assert "Patient 4 does not have channel 'channel2'" in str(e.value)
    assert "Patient 4 does not have channel 'mask'" in str(e.value)
Example #11
class Runner:
    """
    This class contains the high-level logic to start a training run: choose a model configuration by name,
    submit to AzureML if needed, or otherwise start the actual training and test loop.
    :param project_root: The root folder that contains all of the source code that should be executed.
    :param yaml_config_file: The path to the YAML file that contains values to supply into sys.argv.
    :param post_cross_validation_hook: A function to call after waiting for completion of cross validation runs.
    The function is called with the model configuration and the path to the downloaded and merged metrics files.
    :param model_deployment_hook: an optional function for deploying a model in an application-specific way.
    If present, it should take a model config (SegmentationModelBase), an AzureConfig, and an AzureML
    Model as arguments, and return an optional Path and a further object of any type.
    """

    def __init__(self,
                 project_root: Path,
                 yaml_config_file: Path,
                 post_cross_validation_hook: Optional[PostCrossValidationHookSignature] = None,
                 model_deployment_hook: Optional[ModelDeploymentHookSignature] = None):
        self.project_root = project_root
        self.yaml_config_file = yaml_config_file
        self.post_cross_validation_hook = post_cross_validation_hook
        self.model_deployment_hook = model_deployment_hook
        # model_config and azure_config are placeholders for now, and are set properly when command line args are
        # parsed.
        self.model_config: Optional[DeepLearningConfig] = None
        self.azure_config: AzureConfig = AzureConfig()
        self.lightning_container: LightningContainer = None  # type: ignore

    def parse_and_load_model(self) -> ParserResult:
        """
        Parses the command line arguments, and creates configuration objects for the model itself, and for the
        Azure-related parameters. Sets self.azure_config and self.model_config to their proper values. Returns the
        parser output from parsing the model commandline arguments.
        If no "model" argument is provided on the commandline, self.model_config will be set to None, and the return
        value is None.
        """
        # Create a parser that will understand only the args we need for an AzureConfig
        parser1 = create_runner_parser()
        parser_result = parse_args_and_add_yaml_variables(parser1,
                                                          yaml_config_file=self.yaml_config_file,
                                                          project_root=self.project_root,
                                                          fail_on_unknown_args=False)
        azure_config = AzureConfig(**parser_result.args)
        azure_config.project_root = self.project_root
        self.azure_config = azure_config
        self.model_config = None
        if not azure_config.model:
            raise ValueError("Parameter 'model' needs to be set to tell InnerEye which model to run.")
        model_config_loader: ModelConfigLoader = ModelConfigLoader(**parser_result.args)
        # Create the model as per the "model" commandline option. This can return either a built-in config
        # of type DeepLearningConfig, or a LightningContainer.
        config_or_container = model_config_loader.create_model_config_from_name(model_name=azure_config.model)

        def parse_overrides_and_apply(c: object, previous_parser_result: ParserResult) -> ParserResult:
            assert isinstance(c, GenericConfig)
            parser = type(c).create_argparser()
            # For each parser, feed in the unknown settings from the previous parser. All commandline args should
            # be consumed by name, hence fail if there is something that is still unknown.
            parser_result = parse_arguments(parser,
                                            settings_from_yaml=previous_parser_result.unknown_settings_from_yaml,
                                            args=previous_parser_result.unknown,
                                            fail_on_unknown_args=True)
            # Apply the overrides and validate. Overrides can come from either YAML settings or the commandline.
            c.apply_overrides(parser_result.known_settings_from_yaml)
            c.apply_overrides(parser_result.overrides)
            c.validate()
            return parser_result

        # Now create a parser that understands overrides at model/container level.
        parser_result = parse_overrides_and_apply(config_or_container, parser_result)

        if isinstance(config_or_container, LightningContainer):
            self.lightning_container = config_or_container
        elif isinstance(config_or_container, ModelConfigBase):
            # Built-in InnerEye models use a fake container
            self.model_config = config_or_container
            self.lightning_container = InnerEyeContainer(config_or_container)
        else:
            raise ValueError(f"Don't know how to handle a loaded configuration of type {type(config_or_container)}")
        if azure_config.extra_code_directory:
            exist = "exists" if Path(azure_config.extra_code_directory).exists() else "does not exist"
            logging.info(f"extra_code_directory is {azure_config.extra_code_directory}, which {exist}")
        else:
            logging.info("extra_code_directory is unset")
        return parser_result

    def run(self) -> Tuple[Optional[DeepLearningConfig], Optional[Run]]:
        """
        The main entry point for training and testing models from the commandline. This chooses a model to train
        via a commandline argument, runs training or testing, and writes all required info to disk and logs.
        :return: A tuple of the configuration that was used for training (including any commandline overrides
        applied) and the AzureML Run object if the job was submitted, otherwise None.
        """
        # Usually, when we set logging to DEBUG, we want diagnostics about the model
        # build itself, but not the tons of debug information that AzureML submissions create.
        logging_to_stdout(logging.INFO if is_local_rank_zero() else "ERROR")
        initialize_rpdb()
        user_agent.append(azure_util.INNEREYE_SDK_NAME, azure_util.INNEREYE_SDK_VERSION)
        self.parse_and_load_model()
        if self.lightning_container.perform_cross_validation:
            if self.model_config is None:
                raise NotImplementedError("Cross validation for LightningContainer models is not yet supported.")
            # force hyperdrive usage if performing cross validation
            self.azure_config.hyperdrive = True
        run_object: Optional[Run] = None
        if self.azure_config.azureml:
            run_object = self.submit_to_azureml()
        else:
            self.run_in_situ()
        if self.model_config is None:
            return self.lightning_container, run_object
        return self.model_config, run_object

    def submit_to_azureml(self) -> Run:
        """
        Submit a job to AzureML, returning the resulting Run object, or exiting if we were asked to wait for
        completion and the Run did not succeed.
        """
        # The adal package creates a logging.info line each time it gets an authentication token; suppress that.
        logging.getLogger('adal-python').setLevel(logging.WARNING)
        # Azure core prints full HTTP requests even in INFO mode
        logging.getLogger('azure').setLevel(logging.WARNING)
        # PyJWT prints out warnings that are beyond our control
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        if isinstance(self.model_config, DeepLearningConfig) and not self.lightning_container.azure_dataset_id:
            raise ValueError("When running an InnerEye built-in model in AzureML, the 'azure_dataset_id' "
                             "property must be set.")
        hyperdrive_func = lambda run_config: self.model_config.get_hyperdrive_config(run_config)  # type: ignore
        source_config = SourceConfig(
            root_folder=self.project_root,
            entry_script=Path(sys.argv[0]).resolve(),
            conda_dependencies_files=get_all_environment_files(self.project_root),
            hyperdrive_config_func=hyperdrive_func,
            # For large jobs, upload of results can time out because of large checkpoint files. Default is 600
            upload_timeout_seconds=86400,
        )
        source_config.set_script_params_except_submit_flag()
        azure_run = submit_to_azureml(self.azure_config, source_config,
                                      self.lightning_container.all_azure_dataset_ids(),
                                      self.lightning_container.all_dataset_mountpoints())
        logging.info("Job submission to AzureML done.")
        if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
            # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
            # A build step will pick up that file and publish it to Azure DevOps.
            # If pytest_mark is set, this file must exist.
            logging.info("Downloading pytest result file.")
            download_pytest_result(azure_run)
        else:
            logging.info("No pytest_mark present, hence not downloading the pytest result file.")
        # For PR builds where we wait for job completion, the job must have ended in a COMPLETED state.
        if self.azure_config.wait_for_completion and not is_run_and_child_runs_completed(azure_run):
            raise ValueError(f"Run {azure_run.id} in experiment {azure_run.experiment.name} or one of its child "
                             "runs failed.")
        return azure_run

    def print_git_tags(self) -> None:
        """
        When running in AzureML, print all the tags that contain information about the git repository status,
        for answering the question "which code version was used" from a log file only.
        """
        git_tags = get_git_tags(self.azure_config)
        if is_offline_run_context(RUN_CONTEXT):
            # When running on a VM outside AzureML, we can read git information from the current repository
            tags_to_print = git_tags
        else:
            # When running in AzureML, the git repo information is not necessarily passed in, but we copy the git
            # information into run tags after submitting the job, and can read it out here.
            # Only print out those tags that were created from git-related information
            tags_to_print = {key: value for key, value in RUN_CONTEXT.get_tags().items() if key in git_tags}
        logging.info("Git repository information:")
        for key, value in tags_to_print.items():
            logging.info(f"    {key:20}: {value}")

    def run_in_situ(self) -> None:
        """
        Actually run the AzureML job; this method will typically run on an Azure VM.
        """
        # Only set the logging level now. Usually, when we set logging to DEBUG, we want diagnostics about the model
        # build itself, but not the tons of debug information that AzureML submissions create.
        # Suppress the logging from all processes but the one for GPU 0 on each node, to make log files more readable
        logging_to_stdout(self.azure_config.log_level if is_local_rank_zero() else "ERROR")
        suppress_logging_noise()
        if is_global_rank_zero():
            self.print_git_tags()
        # For the PR build in AzureML, we can either run pytest, or train the simple PR model. Running both
        # only works when using DDP_spawn, but that has the side effect of messing up memory consumption of the
        # large models.
        if self.azure_config.pytest_mark:
            outputs_folder = Path.cwd() / fixed_paths.DEFAULT_AML_UPLOAD_DIR
            pytest_passed, results_file_path = run_pytest(self.azure_config.pytest_mark, outputs_folder)
            if not pytest_passed:
                # Terminate if pytest has failed. This makes the smoke test in
                # PR builds fail if pytest fails.
                pytest_failures = f"Not all PyTest tests passed. See {results_file_path}"
                raise ValueError(pytest_failures)
        else:
            # Set environment variables for multi-node training if needed.
            # In particular, the multi-node environment variables should NOT be set in single node
            # training, otherwise this might lead to errors with the c10 distributed backend
            # (https://github.com/microsoft/InnerEye-DeepLearning/issues/395)
            if self.azure_config.num_nodes > 1:
                set_environment_variables_for_multi_node()
            ml_runner = self.create_ml_runner()
            ml_runner.setup()
            ml_runner.start_logging_to_file()
            try:
                ml_runner.run()
            finally:
                disable_logging_to_file()

    def create_ml_runner(self) -> MLRunner:
        """
        Create and return an ML runner using the attributes of this Runner object.
        """
        return MLRunner(
            model_config=self.model_config,
            container=self.lightning_container,
            azure_config=self.azure_config,
            project_root=self.project_root,
            post_cross_validation_hook=self.post_cross_validation_hook,
            model_deployment_hook=self.model_deployment_hook)