Example #1
    def model_inference_train_and_test(self,
                                       checkpoint_handler: CheckpointHandler,
                                       model_proc: ModelProcessing = ModelProcessing.DEFAULT) -> \
            Tuple[Optional[InferenceMetrics], Optional[InferenceMetrics], Optional[InferenceMetrics]]:
        train_metrics = None
        val_metrics = None
        test_metrics = None

        config = self.model_config

        def run_model_test(data_split: ModelExecutionMode) -> Optional[InferenceMetrics]:
            return model_test(config, data_split=data_split, checkpoint_handler=checkpoint_handler,
                              model_proc=model_proc)

        if config.perform_validation_and_test_set_inference:
            # perform inference on test set
            test_metrics = run_model_test(ModelExecutionMode.TEST)
            # perform inference on validation set
            val_metrics = run_model_test(ModelExecutionMode.VAL)

        if config.perform_training_set_inference:
            # perform inference on training set if required
            train_metrics = run_model_test(ModelExecutionMode.TRAIN)

        # log the metrics to AzureML experiment if possible. When doing ensemble runs, log to the Hyperdrive parent run,
        # so that we get the metrics of child run 0 and the ensemble separated.
        if config.is_segmentation_model and not is_offline_run_context(RUN_CONTEXT):
            run_for_logging = PARENT_RUN_CONTEXT if model_proc == ModelProcessing.ENSEMBLE_CREATION else RUN_CONTEXT
            log_metrics(val_metrics=val_metrics, test_metrics=test_metrics,  # type: ignore
                        train_metrics=train_metrics, run_context=run_for_logging)  # type: ignore

        return test_metrics, val_metrics, train_metrics
Example #2
 def get_workspace(self) -> Workspace:
     """
     Return a workspace object for an existing Azure Machine Learning Workspace (or default from YAML).
     When running inside AzureML, the workspace that is retrieved is always the one in the current
     run context. When running outside AzureML, it is created or accessed with the service principal.
      The workspace is read only on the first call to this method; subsequent calls return a cached value.
     Throws an exception if the workspace doesn't exist or the required fields don't lead to a uniquely
     identifiable workspace.
     :return: Azure Machine Learning Workspace
     """
     if self._workspace:
         return self._workspace
     run_context = Run.get_context()
     if is_offline_run_context(run_context):
         if self.subscription_id and self.resource_group:
             service_principal_auth = self.get_service_principal_auth()
             self._workspace = Workspace.get(
                 name=self.workspace_name,
                 auth=service_principal_auth,
                 subscription_id=self.subscription_id,
                 resource_group=self.resource_group)
         else:
             raise ValueError(
                 "The values for 'subscription_id' and 'resource_group' were not found. "
                 "Was the Azure setup completed?")
     else:
         self._workspace = run_context.experiment.workspace
     return self._workspace
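
The get_service_principal_auth helper is not defined in this listing. A minimal sketch of what such a helper could look like, assuming the tenant ID, application ID and secret are passed in explicitly (the argument names are illustrative), with a fall-back to interactive login when no service principal is configured:

from typing import Union

from azureml.core.authentication import (InteractiveLoginAuthentication,
                                          ServicePrincipalAuthentication)


def get_service_principal_auth(tenant_id: str,
                               application_id: str = "",
                               application_secret: str = "") -> Union[InteractiveLoginAuthentication,
                                                                      ServicePrincipalAuthentication]:
    """Creates service principal authentication, or falls back to interactive login."""
    if not application_id or not application_secret:
        # No service principal configured, e.g. on a developer machine: prompt for login.
        return InteractiveLoginAuthentication()
    # Non-interactive authentication, suitable for build agents.
    return ServicePrincipalAuthentication(tenant_id=tenant_id,
                                          service_principal_id=application_id,
                                          service_principal_password=application_secret)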
Example #3
    def mount_or_download_dataset(self) -> Path:
        """
        Makes the dataset that the model uses available on the executing machine. If the present training run is
        outside of AzureML, it expects that either the model has a `local_dataset` field set, in which case no action
        will be taken, or that a dataset is specified in `azure_dataset_id`, in which case it will attempt to download
        that dataset from Azure into the local repository, into the "datasets" folder.
        If the training run is inside of AzureML, the dataset that was specified at job submission time will be
        mounted or downloaded.
        Returns the path of the dataset on the executing machine.
        """
        azure_dataset_id = self.model_config.azure_dataset_id

        if is_offline_run_context(RUN_CONTEXT):
            # The present run is outside of AzureML: If local_dataset is set, use that as the path to the data.
            # Otherwise, download the dataset specified by the azure_dataset_id
            local_dataset = self.model_config.local_dataset
            if (not azure_dataset_id) and (local_dataset is None):
                raise ValueError("The model must contain either local_dataset or azure_dataset_id.")
            if local_dataset:
                expected_dir = Path(local_dataset)
                if not expected_dir.is_dir():
                    raise FileNotFoundError(f"The model uses a dataset in {expected_dir}, but that does not exist.")
                logging.info(f"Model training will use the local dataset provided in {expected_dir}")
                return expected_dir
            return download_dataset(azure_dataset_id=azure_dataset_id,
                                    target_folder=self.project_root / fixed_paths.DATASETS_DIR_NAME,
                                    azure_config=self.azure_config)

        # Inside of AzureML, datasets can be either mounted or downloaded.
        if not azure_dataset_id:
            raise ValueError("The model must contain azure_dataset_id for running on AML")
        mounted = try_to_mount_input_dataset(RUN_CONTEXT)
        if not mounted:
            raise ValueError("Unable to mount or download input dataset.")
        return mounted
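
try_to_mount_input_dataset is referenced above but not shown. A plausible sketch, assuming the dataset was attached to the run as a named input at submission time (the input name below is a placeholder), so that AzureML exposes its mount or download location via run.input_datasets:

from pathlib import Path
from typing import Optional

from azureml.core import Run

# Placeholder for the name under which the dataset is assumed to be attached to the run.
INPUT_DATA_KEY = "input_data"


def try_to_mount_input_dataset(run_context: Run) -> Optional[Path]:
    """Returns the mount or download path of the run's input dataset, or None if there is none."""
    try:
        mounted_path = run_context.input_datasets[INPUT_DATA_KEY]
    except (AttributeError, KeyError):
        return None
    return Path(mounted_path)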
Example #4
 def download_or_get_local_file(self,
                                run: Optional[Run],
                                blob_to_download: PathOrString,
                                destination: Path,
                                local_src_subdir: Optional[Path] = None) -> Optional[Path]:
     """
     Downloads a file from the results folder of an AzureML run, or copies it from a local results folder.
     Returns the path to the downloaded file if it exists, or None if the file was not found.
      If blob_to_download contains folders, the same folder structure will be created inside the destination folder.
     For example, downloading "foo.txt" to "/c/temp" will create "/c/temp/foo.txt". Downloading "foo/bar.txt"
     to "/c/temp" will create "/c/temp/foo/bar.txt"
     :param blob_to_download: path of data to download within the run
     :param destination: directory to write to
     :param run: The AzureML run to download from.
     :param local_src_subdir: if not None, then if we copy from a local results folder, that folder is
     self.outputs_directory/local_src_subdir/blob_to_download instead of self.outputs_directory/blob_to_download
     :return: The path to the downloaded file, or None if the file was not found.
     """
     blob_path = Path(blob_to_download)
     blob_parent = blob_path.parent
     if blob_parent != Path("."):
         destination = destination / blob_parent
     downloaded_file = destination / blob_path.name
     # If we've already downloaded the data, leave it as it is
     if downloaded_file.exists():
         logging.info(f"Download of '{blob_path}' to '{downloaded_file}: not needed, already exists'")
         return downloaded_file
     logging.info(f"Download of '{blob_path}' to '{downloaded_file}': proceeding")
     # If the provided run is the current run, then there is nothing to download.
     # Just copy the provided path in the outputs directory to the destination.
     if not destination.exists():
         destination.mkdir(parents=True)
     if run is None or Run.get_context().id == run.id or is_parent_run(run) or is_offline_run_context(run):
         if run is None:
             assert self.local_run_results is not None, "Local run results must be set in unit testing"
             local_src = Path(self.local_run_results)
             if self.local_run_result_split_suffix:
                 local_src = local_src / self.local_run_result_split_suffix
         else:
             local_src = Path(self.outputs_directory)
         if local_src_subdir is not None:
             local_src = local_src / local_src_subdir
         local_src = local_src / blob_path
         if local_src.exists():
             logging.info(f"Copying files from {local_src} to {destination}")
             return Path(shutil.copy(local_src, destination))
         return None
     else:
         try:
             return download_outputs_from_run(
                 blobs_path=blob_path,
                 destination=destination,
                 run=run,
                 is_file=True
             )
         except Exception as ex:
             logging.warning(f"File {blob_to_download} not found in output of run {run.id}: {ex}")
             return None
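
download_outputs_from_run is not part of this listing. A hedged sketch, assuming it wraps the azureml-core Run.download_file and Run.download_files calls:

from pathlib import Path

from azureml.core import Run


def download_outputs_from_run(blobs_path: Path,
                              destination: Path,
                              run: Run,
                              is_file: bool = False) -> Path:
    """Downloads a file or folder from the given run into the destination folder."""
    destination.mkdir(parents=True, exist_ok=True)
    if is_file:
        # Download a single named file. AzureML stores run files with forward slashes.
        target = destination / blobs_path.name
        run.download_file(name=blobs_path.as_posix(), output_file_path=str(target))
        return target
    # Download everything stored under the given prefix into the destination folder.
    run.download_files(prefix=blobs_path.as_posix(), output_directory=str(destination))
    return destination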
Example #5
 def log_image(self, name: str, path: str) -> None:
     """
     Logs a PNG image stored in `path` to Azure and Tensorboard.
     """
     if not is_offline_run_context(RUN_CONTEXT):
         RUN_CONTEXT.log_image(name=name, path=path)
     writer = self.tensorboard_logger
     img = Image.open(path).convert("RGB")
     img = np.transpose(np.asarray(img), (2, 0, 1))
     writer.add_image(name, img, self.epoch)
Example #6
def compare_folder_contents(expected_folder: Path,
                            csv_relative_tolerance: float,
                            actual_folder: Optional[Path] = None,
                            run: Optional[Run] = None) -> List[str]:
    """
    Compares a set of files in a folder, against files in either the other folder or files stored in the given
    AzureML run. Each file that is present in the "expected" folder must be also present in the "actual" folder
    (or the AzureML run), with exactly the same contents, in the same folder structure.
    For example, if there is a file "<expected>/foo/bar/contents.txt", then there must also be a file
    "<actual>/foo/bar/contents.txt"

    :param expected_folder: A folder with files that are expected to be present.
    :param actual_folder: The output folder with the actually produced files.
    :param run: An AzureML run
    :param csv_relative_tolerance: When comparing CSV files, use this as the maximum allowed relative discrepancy.
    If 0.0, do not allow any discrepancy.
    :return: A list of human readable error messages, with message and file path. If no errors are found, the list is
    empty.
    """
    messages = []
    if run and is_offline_run_context(run):
        logging.warning(
            "Skipping file comparison because the given run context is an AzureML offline run."
        )
        return []
    files_in_run: List[str] = run.get_file_names() if run else []
    temp_folder = Path(tempfile.mkdtemp()) if run else None
    for file in expected_folder.rglob("*"):
        # rglob also returns folders, skip those
        if file.is_dir():
            continue
        # All files stored in AzureML runs use Linux-style path
        file_relative = file.relative_to(expected_folder).as_posix()
        if actual_folder:
            actual_file = actual_folder / file_relative
        elif temp_folder is not None and run is not None:
            actual_file = temp_folder / file_relative
            if file_relative in files_in_run:
                run.download_file(name=str(file_relative),
                                  output_file_path=str(actual_file))
        else:
            raise ValueError(
                "One of the two arguments run, actual_folder must be provided."
            )
        message = compare_files(expected=file,
                                actual=actual_file,
                                csv_relative_tolerance=csv_relative_tolerance
                                ) if actual_file.exists() else MISSING_FILE
        if message:
            messages.append(f"{message}: {file_relative}")
        logging.info(f"File {file_relative}: {message or 'OK'}")
    if temp_folder:
        shutil.rmtree(temp_folder)
    return messages
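
compare_files and the MISSING_FILE constant are used above but not shown. A plausible sketch, assuming CSV files are compared with pandas under the given relative tolerance and all other files byte-for-byte (the constant values are illustrative):

import filecmp
from pathlib import Path

import pandas as pd

# Illustrative values; the listing above only shows that MISSING_FILE exists.
MISSING_FILE = "Missing file"
CONTENTS_MISMATCH = "Contents mismatch"


def compare_files(expected: Path, actual: Path, csv_relative_tolerance: float = 0.0) -> str:
    """Returns an empty string if the files match, otherwise a short error message."""
    if expected.suffix == ".csv":
        expected_df = pd.read_csv(expected)
        actual_df = pd.read_csv(actual)
        try:
            # check_like ignores row/column ordering; rtol is the allowed relative discrepancy.
            pd.testing.assert_frame_equal(expected_df, actual_df,
                                          check_like=True,
                                          rtol=csv_relative_tolerance)
        except AssertionError:
            return CONTENTS_MISMATCH
        return ""
    # All other file types: byte-for-byte comparison.
    return "" if filecmp.cmp(str(expected), str(actual), shallow=False) else CONTENTS_MISMATCH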
Example #7
def score_image(args: ScorePipelineConfig) -> Path:
    """
    Performs model inference on a single image, by doing the following:
    1) Copy the provided data root directory to the root (this contains the model checkpoints and image to infer)
    2) Instantiate an inference pipeline based on the provided model_inference.json in the snapshot
    3) Store the segmentation file in the current directory
    4) Upload the segmentation to AML
    :param args: The ScorePipelineConfig object that specifies the model folder, data folder, input image files
    and result file names.
    :return: The path to the resulting segmentation file, or to the zipped DICOM-RT file when use_dicom is set.
    """
    logging.getLogger().setLevel(logging.INFO)
    score_py_folder = Path(__file__).parent
    model_folder = Path(args.model_folder or str(score_py_folder))

    run_context = Run.get_context()
    logging.info(f"Run context={run_context.id}")

    if args.use_dicom:
        # Only a single zip file is supported.
        if len(args.image_files) > 1:
            raise ValueError("Supply exactly one zip file in args.images.")
        input_zip_file = check_input_file(args.data_folder,
                                          args.image_files[0])
        reference_series_folder = model_folder / "temp_extraction"
        nifti_filename = model_folder / "temp_nifti.nii.gz"
        convert_zipped_dicom_to_nifti(input_zip_file, reference_series_folder,
                                      nifti_filename)
        test_images = [nifti_filename]
    else:
        test_images = [
            check_input_file(args.data_folder, file)
            for file in args.image_files
        ]

    images = [load_nifti_image(file) for file in test_images]

    inference_pipeline, config = init_from_model_inference_json(
        model_folder, args.use_gpu)
    segmentation = run_inference(images, inference_pipeline, config)

    segmentation_file_name = model_folder / args.result_image_name
    result_dst = store_as_ubyte_nifti(segmentation, images[0].header,
                                      segmentation_file_name)

    if args.use_dicom:
        result_dst = convert_nifti_to_zipped_dicom_rt(
            result_dst, reference_series_folder, model_folder, config,
            args.result_zip_dicom_name, args.model_id)

    if not is_offline_run_context(run_context):
        upload_file_name = args.result_zip_dicom_name if args.use_dicom else args.result_image_name
        run_context.upload_file(upload_file_name, str(result_dst))
    logging.info(f"Segmentation completed: {result_dst}")
    return result_dst
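
check_input_file is called above but not defined in this listing; Example #12 below performs the same check inline. A sketch of an equivalent helper, derived from that inline code:

from pathlib import Path


def check_input_file(data_folder: Path, file_name: str) -> Path:
    """Returns the full path of an input file, raising ValueError if it is missing from the data folder."""
    full_file_path = data_folder / file_name
    if not full_file_path.exists():
        message = str(data_folder) if data_folder.is_absolute() \
            else f"{data_folder}, absolute: {data_folder.absolute()}"
        raise ValueError(f"File {file_name} does not exist in data folder {message}")
    return full_file_path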
Example #8
 def create_filesystem(self, project_root: Path = fixed_paths.repository_root_directory()) -> None:
     """
     Creates new file system settings (outputs folder, logs folder) based on the information stored in the
     present object. If any of the folders do not yet exist, they are created.
     :param project_root: The root folder for the codebase that triggers the training run.
     """
     self.file_system_config = DeepLearningFileSystemConfig.create(
         project_root=project_root,
         model_name=self.model_name,
         is_offline_run=is_offline_run_context(RUN_CONTEXT),
         output_to=self.output_to
     )
Example #9
 def log_to_azure(self, label: str, metric: float) -> None:
     """
     Logs a metric as a key/value pair to AzureML.
     """
     if not is_offline_run_context(RUN_CONTEXT):
         metric_name = self.logging_prefix + label
         RUN_CONTEXT.log(metric_name, metric)
         # When running in a cross validation setting, log all metrics to the hyperdrive parent run too,
         # so that we can easily overlay graphs across runs.
         if self.log_to_parent_run and PARENT_RUN_CONTEXT:
             if self.cross_validation_split_index > DEFAULT_CROSS_VALIDATION_SPLIT_INDEX:
                 PARENT_RUN_CONTEXT.log(f"{metric_name}_Split{self.cross_validation_split_index}",
                                        metric)
Example #10
    def run(self) -> None:
        if self._interval_seconds <= 0:
            logging.warning(
                "Resource monitoring requires an interval that is larger than 0 seconds, but "
                "got: {}. Exiting.".format(self._interval_seconds))
            return
        logging.info("Process ({}) started with pid: {}".format(
            self.name, self.pid))
        # create the TB writers and AML run context for this process
        writer = tensorboardX.SummaryWriter(self._tb_log_file_path)
        run_context = Run.get_context()
        is_offline_run = is_offline_run_context(run_context)
        current_iteration = 0

        def log_to_azure_and_tb(label: str, value: float) -> None:
            writer.add_scalar(label, value, current_iteration)
            if not is_offline_run:
                run_context.log(label, value)

        gpu_available = is_gpu_available()
        while True:
            if gpu_available:
                gpus: List[GPU] = GPUtil.getGPUs()
                if len(gpus) > 0:
                    for gpu in gpus:
                        log_to_azure_and_tb(
                            'Diagnostics/GPU_{}_Load_Percent'.format(gpu.id),
                            gpu.load * 100)
                        log_to_azure_and_tb(
                            'Diagnostics/GPU_{}_MemUtil_Percent'.format(
                                gpu.id), gpu.memoryUtil * 100)
                    # log the average GPU usage
                    log_to_azure_and_tb(
                        'Diagnostics/Average_GPU_Load_Percent',
                        statistics.mean(map(lambda x: x.load, gpus)) * 100)
                    log_to_azure_and_tb(
                        'Diagnostics/Average_GPU_MemUtil_Percent',
                        statistics.mean(map(lambda x: x.memoryUtil, gpus)) *
                        100)

            # log the CPU util
            log_to_azure_and_tb('Diagnostics/CPU_Util_Percent',
                                psutil.cpu_percent(interval=None))
            log_to_azure_and_tb('Diagnostics/CPU_MemUtil_Percent',
                                psutil.virtual_memory()[2])

            current_iteration += 1
            # pause the thread for the requested delay
            time.sleep(self._interval_seconds)
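
is_gpu_available is referenced in the monitor loop but not shown. One plausible implementation, assuming the GPUtil package that the monitor already uses for per-GPU statistics:

import GPUtil


def is_gpu_available() -> bool:
    """Returns True if at least one GPU can be enumerated on this machine."""
    try:
        return len(GPUtil.getGPUs()) > 0
    except Exception:
        # Missing NVIDIA drivers or nvidia-smi: treat as "no GPU available".
        return False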
Example #11
 def print_git_tags(self) -> None:
     """
     When running in AzureML, print all the tags that contain information about the git repository status,
     for answering the question "which code version was used" from a log file only.
     """
     git_tags = get_git_tags(self.azure_config)
     if is_offline_run_context(RUN_CONTEXT):
         # When running on a VM outside AzureML, we can read git information from the current repository
         tags_to_print = git_tags
     else:
         # When running in AzureML, the git repo information is not necessarily passed in, but we copy the git
         # information into run tags after submitting the job, and can read it out here.
         # Only print out those tags that were created from git-related information
         tags_to_print = {key: value for key, value in RUN_CONTEXT.get_tags().items() if key in git_tags}
     logging.info("Git repository information:")
     for key, value in tags_to_print.items():
         logging.info(f"    {key:20}: {value}")
Example #12
def score_image(args: ScorePipelineConfig) -> Path:
    """
    Performs model inference on a single image, by doing the following:
    1) Copy the provided data root directory to the root (this contains the model checkpoints and image to infer)
    2) Instantiate an inference pipeline based on the provided model_inference.json in the snapshot
    3) Store the segmentation file in the current directory
    4) Upload the segmentation to AML
    :param args: The ScorePipelineConfig object that specifies the model folder, data folder, input image files
    and result file name.
    :return: The path to the resulting segmentation file.
    """
    logging.getLogger().setLevel(logging.INFO)
    score_py_folder = Path(__file__).parent
    model_folder = Path(args.model_folder or str(score_py_folder))

    run_context = Run.get_context()
    logging.info(f"Run context={run_context.id}")

    test_images = []
    data_folder = args.data_folder
    for file in args.image_files:
        full_file_path = data_folder / file
        if not full_file_path.exists():
            message = \
                str(data_folder) if data_folder.is_absolute() else f"{data_folder}, absolute: {data_folder.absolute()}"
            raise ValueError(
                f"File {file} does not exist in data folder {message}")
        test_images.append(full_file_path)
    images = [load_nifti_image(file) for file in test_images]
    inference_pipeline, config = init_from_model_inference_json(
        model_folder, args.use_gpu)
    segmentation = run_inference(images, inference_pipeline, config)

    segmentation_file_name = str(model_folder / args.result_image_name)
    result_dst = store_as_ubyte_nifti(segmentation, images[0].header,
                                      segmentation_file_name)
    if not is_offline_run_context(run_context):
        run_context.upload_file(args.result_image_name, segmentation_file_name)
    logging.info(f"Segmentation completed: {result_dst}")
    return result_dst
Example #13
def score_image(args: ScorePipelineConfig) -> Path:
    """
    Performs model inference on a single image, by doing the following:
    1) Copy the provided data root directory to the root (this contains the model checkpoints and image to infer)
    2) Instantiate an inference pipeline based on the provided model_inference.json in the snapshot
    3) Store the segmentation file in the current directory
    4) Upload the segmentation to AML
    :param args: The ScorePipelineConfig object that specifies the project root, data root, test image channels
    and result file name.
    :return: The path to the resulting segmentation file.
    """
    logging.getLogger().setLevel(logging.INFO)
    project_root = Path(args.project_root)

    # copy the model to the current directory
    copy_tree(args.data_root, str(project_root))
    logging.info(
        f'Copied contents of data_root: {args.data_root} to {project_root}')

    run_context = Run.get_context()
    logging.info(f"Run context={run_context.id}")

    images = [
        load_nifti_image(project_root / DEFAULT_DATA_FOLDER / x)
        for x in args.test_image_channels
    ]
    inference_pipeline, config = init_from_model_inference_json(
        project_root, args.use_gpu)
    segmentation = run_inference(images, inference_pipeline, config)

    segmentation_file_name = str(project_root / args.result_image_name)
    result_dst = store_as_ubyte_nifti(segmentation, images[0].header,
                                      segmentation_file_name)
    if not is_offline_run_context(run_context):
        run_context.upload_file(args.result_image_name, segmentation_file_name)
    logging.info(f"Segmentation completed: {result_dst}")

    return Path(result_dst)
Example #14
 def __init__(self) -> None:
     super().__init__()
     self.is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
Example #15
    def register_segmentation_model(
            self, checkpoint_paths: List[Path], model_description: str,
            model_proc: ModelProcessing
    ) -> Tuple[Optional[Model], Optional[Any]]:
        """
        Registers a new model in the workspace's model registry to be deployed further,
        and creates a model zip for portal deployment (if required). This model is the
        model checkpoint with the highest test accuracy.
        :param model_description: A string description that is added to the deployed model. It would usually contain
        the test set performance and information at which epoch the result was achieved.
        :param checkpoint_paths: Checkpoint paths to use to upload model checkpoints to AML.
        :param model_proc: whether it's a single or ensemble model.
        :returns Tuple element 1: AML model object, or None if no model could be registered.
        Tuple element 2: The result of running the model_deployment_hook, or None if no hook was supplied.
        """
        is_offline_run = is_offline_run_context(RUN_CONTEXT)
        workspace = None
        # Terminate early if this is running outside AzureML, and we can't access the AzureML workspace. This
        # saves time copying around files.
        if is_offline_run:
            try:
                workspace = self.azure_config.get_workspace()
            except Exception:
                logging.warning(
                    "Unable to retrieve AzureML workspace. Was the Azure setup completed?"
                )
                logging.info("No model was registered in AzureML.")
                return None, None
        # The files for the final model can't live in the outputs folder. If they do, then at registration time the
        # files may not yet have been uploaded by hosttools, and that may (or may not) cause errors. Hence, place the
        # folder for the final models outside of "outputs", and upload manually.
        model_subfolder = FINAL_MODEL_FOLDER if model_proc == ModelProcessing.DEFAULT else FINAL_ENSEMBLE_MODEL_FOLDER
        final_model_folder = self.model_config.file_system_config.run_folder / model_subfolder
        # Copy all code from project and InnerEye into the model folder, and copy over checkpoints.
        # This increases the size of the data stored for the run. The other option would be to store all checkpoints
        # right in the final model folder - however, then that would also contain any other checkpoints that the model
        # produced or downloaded for recovery, bloating the final model file.
        self.copy_child_paths_to_folder(final_model_folder, checkpoint_paths)
        logging.info("Registering the model on the workspace.")
        if is_offline_run:
            model_description = model_description + f"\nModel built by {self.azure_config.build_user} outside AzureML"
            model = Model.register(workspace=workspace,
                                   model_name=self.model_config.model_name,
                                   model_path=str(final_model_folder),
                                   description=model_description)
        else:
            # This is the path under which AzureML will know the files: Either "final_model" or "final_ensemble_model"
            artifacts_path = model_subfolder
            # If the present run is a child run of a Hyperdrive parent run, and we are building an ensemble model,
            # register the model on the parent run.
            if PARENT_RUN_CONTEXT and model_proc == ModelProcessing.ENSEMBLE_CREATION:
                run_to_register_on = PARENT_RUN_CONTEXT
                logging.info(
                    f"Registering the model on the parent run {run_to_register_on.id}"
                )
            else:
                run_to_register_on = RUN_CONTEXT
                logging.info(
                    f"Registering the model on the current run {run_to_register_on.id}"
                )
            logging.info(
                f"Uploading files in {final_model_folder} with prefix '{artifacts_path}'"
            )
            final_model_folder_relative = final_model_folder.relative_to(
                Path.cwd())
            run_to_register_on.upload_folder(
                name=artifacts_path, path=str(final_model_folder_relative))
            # When registering the model on the run, we need to provide a relative path inside of the run's output
            # folder in `model_path`
            model = run_to_register_on.register_model(
                model_name=self.model_config.model_name,
                model_path=artifacts_path,
                tags=RUN_CONTEXT.get_tags(),
                description=model_description)

        deployment_result = None
        logging.info(
            f"Registered {model_proc.value} model: {model.name}, with Id: {model.id}"
        )
        # update the run's tags with the registered model information
        if not is_offline_run:
            update_run_tags(RUN_CONTEXT, {MODEL_ID_KEY_NAME: model.id})
        # create a version of the model for deployment if the hook is provided
        if self.model_deployment_hook is not None:
            assert isinstance(self.model_config, SegmentationModelBase)
            deployment_result = self.model_deployment_hook(
                self.model_config, self.azure_config, model, model_proc)
        return model, deployment_result
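
The model_deployment_hook is invoked with the model config, the Azure config, the registered AzureML model and the ModelProcessing value, and its return value becomes the second element of the returned tuple. A hedged sketch of a user-supplied hook with that signature (the tagging logic is purely illustrative):

from typing import Any

from azureml.core import Model


def example_deployment_hook(model_config: "SegmentationModelBase",
                            azure_config: "AzureConfig",
                            model: Model,
                            model_proc: "ModelProcessing") -> Any:
    """Illustrative hook: tags the registered model and returns its ID."""
    # Whatever is returned here is handed back as deployment_result by register_segmentation_model.
    model.add_tags({"deployment": "queued", "processing": model_proc.value})
    return model.id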
Example #16
def model_train(config: ModelConfigBase,
                checkpoint_handler: CheckpointHandler) -> ModelTrainingResults:
    """
    The main training loop. It creates the model, dataset, optimizer_type, and criterion, then proceeds
    to train the model. If a checkpoint was specified, then it loads the checkpoint before resuming training.

    :param config: The arguments which specify all required information.
    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
    :raises TypeError: If the arguments are of the wrong type.
    :raises ValueError: When there are issues loading a previous checkpoint.
    """
    # Save the dataset files for later use in cross validation analysis
    config.write_dataset_files()

    # set the random seed for all libraries
    ml_util.set_random_seed(config.get_effective_random_seed(),
                            "Patch visualization")
    # Visualize how patches are sampled for segmentation models. This changes the random generator, but we don't
    # want training to depend on how many patients we visualized, and hence set the random seed again right after.
    with logging_section(
            "Visualizing the effect of sampling random crops for training"):
        visualize_random_crops_for_dataset(config)
    ml_util.set_random_seed(config.get_effective_random_seed(),
                            "Model training")

    logging.debug("Creating the PyTorch model.")

    # Create the train loader and validation loader to load images from the dataset
    data_loaders = config.create_data_loaders()

    # Get the path to the checkpoint to recover from
    checkpoint_path = checkpoint_handler.get_recovery_path_train()

    models_and_optimizer = ModelAndInfo(
        config=config,
        model_execution_mode=ModelExecutionMode.TRAIN,
        checkpoint_path=checkpoint_path)

    # Create the main model
    # If continuing from a previous run at a specific epoch, then load the previous model.
    model_loaded = models_and_optimizer.try_create_model_and_load_from_checkpoint(
    )
    if not model_loaded:
        raise ValueError(
            "There was no checkpoint file available for the model for given start_epoch {}"
            .format(config.start_epoch))

    # Print out a detailed breakdown of layers, memory consumption and time.
    generate_and_print_model_summary(config, models_and_optimizer.model)

    # Move model to GPU and adjust for multiple GPUs
    models_and_optimizer.adjust_model_for_gpus()

    # Create the mean teacher model and move to GPU
    if config.compute_mean_teacher_model:
        mean_teacher_model_loaded = models_and_optimizer.try_create_mean_teacher_model_load_from_checkpoint_and_adjust(
        )
        if not mean_teacher_model_loaded:
            raise ValueError(
                "There was no checkpoint file available for the mean teacher model "
                f"for given start_epoch {config.start_epoch}")

    # Create optimizer
    models_and_optimizer.create_optimizer()
    if checkpoint_handler.should_load_optimizer_checkpoint():
        optimizer_loaded = models_and_optimizer.try_load_checkpoint_for_optimizer(
        )
        if not optimizer_loaded:
            raise ValueError(
                f"There was no checkpoint file available for the optimizer for given start_epoch "
                f"{config.start_epoch}")

    # Create checkpoint directory for this run if it doesn't already exist
    logging.info(f"Models are saved at {config.checkpoint_folder}")
    if not config.checkpoint_folder.is_dir():
        config.checkpoint_folder.mkdir()

    # Create the SummaryWriters for Tensorboard
    writers = create_summary_writers(config)
    config.create_dataframe_loggers()

    # Create LR scheduler
    l_rate_scheduler = SchedulerWithWarmUp(config,
                                           models_and_optimizer.optimizer)

    # Training loop
    logging.info("Starting training")
    train_results_per_epoch, val_results_per_epoch, learning_rates_per_epoch = [], [], []

    resource_monitor = None
    if config.monitoring_interval_seconds > 0:
        # initialize and start GPU monitoring
        diagnostics_events = config.logs_folder / "diagnostics"
        logging.info(
            f"Starting resource monitor, outputting to {diagnostics_events}")
        resource_monitor = ResourceMonitor(
            interval_seconds=config.monitoring_interval_seconds,
            tensorboard_folder=diagnostics_events)
        resource_monitor.start()

    gradient_scaler = GradScaler(
    ) if config.use_gpu and config.use_mixed_precision else None
    optimal_temperature_scale_values = []
    for epoch in config.get_train_epochs():
        logging.info("Starting epoch {}".format(epoch))
        save_epoch = config.should_save_epoch(
            epoch) and models_and_optimizer.optimizer is not None

        # store the learning rates used for each epoch
        epoch_lrs = l_rate_scheduler.get_last_lr()
        learning_rates_per_epoch.append(epoch_lrs)

        train_val_params: TrainValidateParameters = \
            TrainValidateParameters(data_loader=data_loaders[ModelExecutionMode.TRAIN],
                                    model=models_and_optimizer.model,
                                    mean_teacher_model=models_and_optimizer.mean_teacher_model,
                                    epoch=epoch,
                                    optimizer=models_and_optimizer.optimizer,
                                    gradient_scaler=gradient_scaler,
                                    epoch_learning_rate=epoch_lrs,
                                    summary_writers=writers,
                                    dataframe_loggers=config.metrics_data_frame_loggers,
                                    in_training_mode=True)
        training_steps = create_model_training_steps(config, train_val_params)
        train_epoch_results = train_or_validate_epoch(training_steps)
        train_results_per_epoch.append(train_epoch_results.metrics)

        metrics.validate_and_store_model_parameters(writers.train, epoch,
                                                    models_and_optimizer.model)
        # Run without adjusting weights on the validation set
        train_val_params.in_training_mode = False
        train_val_params.data_loader = data_loaders[ModelExecutionMode.VAL]
        # if temperature scaling is enabled then do not save validation metrics for the checkpoint epochs
        # as these will be re-computed after performing temperature scaling on the validation set.
        if isinstance(config, SequenceModelBase):
            train_val_params.save_metrics = not (
                save_epoch and config.temperature_scaling_config)

        training_steps = create_model_training_steps(config, train_val_params)
        val_epoch_results = train_or_validate_epoch(training_steps)
        val_results_per_epoch.append(val_epoch_results.metrics)

        if config.is_segmentation_model:
            metrics.store_epoch_stats_for_segmentation(
                config.outputs_folder, epoch, epoch_lrs,
                train_epoch_results.metrics, val_epoch_results.metrics)

        if save_epoch:
            # perform temperature scaling if required
            if isinstance(
                    config,
                    SequenceModelBase) and config.temperature_scaling_config:
                optimal_temperature, scaled_val_results = \
                    temperature_scaling_steps(config, train_val_params, val_epoch_results)
                optimal_temperature_scale_values.append(optimal_temperature)
                # overwrite the metrics for the epoch with the metrics from the temperature scaled model
                val_results_per_epoch[-1] = scaled_val_results.metrics

            models_and_optimizer.save_checkpoint(epoch)

        # Updating the learning rate should happen at the end of the training loop, so that the
        # initial learning rate will be used for the very first epoch.
        l_rate_scheduler.step()

    model_training_results = ModelTrainingResults(
        train_results_per_epoch=train_results_per_epoch,
        val_results_per_epoch=val_results_per_epoch,
        learning_rates_per_epoch=learning_rates_per_epoch,
        optimal_temperature_scale_values_per_checkpoint_epoch=
        optimal_temperature_scale_values)

    logging.info("Finished training")

    # Since we have trained the model further, let the checkpoint_handler object know so it can handle
    # checkpoints correctly.
    checkpoint_handler.additional_training_done()

    # Upload visualization directory to AML run context to be able to see it
    # in the Azure UI.
    if config.max_batch_grad_cam > 0 and config.visualization_folder.exists():
        RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER,
                                  path=str(config.visualization_folder))

    writers.close_all()
    config.metrics_data_frame_loggers.close_all()
    if resource_monitor:
        # stop the resource monitoring process
        logging.info(
            "Shutting down the resource monitor process. Aggregate resource utilization:"
        )
        for name, value in resource_monitor.read_aggregate_metrics():
            logging.info(f"{name}: {value}")
            if not is_offline_run_context(RUN_CONTEXT):
                RUN_CONTEXT.log(name, value)
        resource_monitor.kill()

    return model_training_results
Example #17
def create_and_submit_experiment(azure_config: AzureConfig,
                                 script_run_config: ScriptRunConfig,
                                 commandline_args: str) -> Run:
    """
    Creates an AzureML experiment in the workspace and submits it for execution.
    :param azure_config: azure related configurations to setup a valid workspace.
    :param script_run_config: The configuration for the script that should be run inside of AzureML.
    :param commandline_args: A string with all commandline arguments that were provided to the runner. These are only
    used to set a tag on the submitted AzureML run.
    :returns: Run object for the submitted AzureML run
    """
    workspace = azure_config.get_workspace()
    experiment_name = create_experiment_name(azure_config)
    exp = Experiment(workspace=workspace,
                     name=azure_util.to_azure_friendly_string(experiment_name))

    # submit a training/testing run associated with the experiment
    run: Run = exp.submit(script_run_config)

    if is_offline_run_context(run):
        # This codepath will only be executed in unit tests, when exp.submit is mocked.
        return run

    # Set metadata for the run.
    set_run_tags(run, azure_config, commandline_args=commandline_args)

    print(
        "\n=============================================================================="
    )
    print(f"Successfully queued new run {run.id} in experiment: {exp.name}")

    if azure_config.run_recovery_id:
        print(f"\nRecovered from: {azure_config.run_recovery_id}")

    recovery_id = azure_util.create_run_recovery_id(run)
    recovery_file = Path(RUN_RECOVERY_FILE)
    if recovery_file.exists():
        recovery_file.unlink()
    recovery_file.write_text(recovery_id)

    print("Experiment URL: {}".format(exp.get_portal_url()))
    print("Run URL: {}".format(run.get_portal_url()))
    print(
        "If this run fails, re-start runner.py and supply these additional arguments: "
        f"--run_recovery_id={recovery_id}")
    print(
        f"The run recovery ID has been written to this file: {recovery_file}")
    print(
        "=============================================================================="
    )
    if azure_config.tensorboard and azure_config.azureml:
        print("Starting TensorBoard now because you specified --tensorboard")
        monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[run.id]),
                azure_config=azure_config)
    else:
        print(
            f"To monitor this run locally using TensorBoard, run the script: "
            f"InnerEye/Azure/tensorboard_monitor.py --run_ids={run.id}")
        print(
            "=============================================================================="
        )
    return run
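
set_run_tags is called right after submission but not shown here. A minimal sketch, assuming it records the commandline and a few git-related fields taken from AzureConfig (the AzureConfig field names below are assumptions):

from azureml.core import Run


def set_run_tags(run: Run, azure_config: "AzureConfig", commandline_args: str) -> None:
    """Attaches metadata tags to the freshly submitted run."""
    # Tags are visible in the AzureML portal; print_git_tags in Example #11 reads the git-related ones back.
    run.set_tags({
        "commandline_args": commandline_args,
        "source_repository": azure_config.git_repository,  # assumed field name
        "source_branch": azure_config.git_branch,  # assumed field name
        "source_id": azure_config.git_commit,  # assumed field name (commit hash)
    })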
Example #18
def create_lightning_trainer(container: LightningContainer,
                             resume_from_checkpoint: Optional[Path] = None,
                             num_nodes: int = 1,
                             multiple_trainloader_mode: str = "max_size_cycle") -> \
        Tuple[Trainer, StoringLogger]:
    """
    Creates a Pytorch Lightning Trainer object for the given model configuration. It creates checkpoint handlers
    and loggers. That includes a diagnostic logger for use in unit tests, which is also returned as the second
    return value.
    :param container: The container with model and data.
    :param resume_from_checkpoint: If provided, training resumes from this checkpoint.
    :param num_nodes: The number of nodes to use in distributed training.
    :param multiple_trainloader_mode: How a Trainer with multiple train dataloaders cycles through them; passed
    through to the Trainer constructor.
    :return: A tuple [Trainer object, diagnostic logger]
    """
    logging.debug(f"resume_from_checkpoint: {resume_from_checkpoint}")
    num_gpus = container.num_gpus_per_node()
    effective_num_gpus = num_gpus * num_nodes
    strategy = None
    if effective_num_gpus == 0:
        accelerator = "cpu"
        devices = 1
        message = "CPU"
    else:
        accelerator = "gpu"
        devices = num_gpus
        message = f"{devices} GPU"
        if effective_num_gpus > 1:
            # Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of
            # GPU memory).
            # Initialize the DDP plugin. The default for pl_find_unused_parameters is False. If True, the plugin
            # prints out lengthy warnings about the performance impact of find_unused_parameters.
            strategy = DDPPlugin(find_unused_parameters=container.pl_find_unused_parameters)
            message += "s per node with DDP"
    logging.info(f"Using {message}")
    tensorboard_logger = TensorBoardLogger(save_dir=str(container.logs_folder), name="Lightning", version="")
    loggers = [tensorboard_logger, AzureMLLogger(False)]
    storing_logger = StoringLogger()
    loggers.append(storing_logger)
    # Use 32bit precision when running on CPU. Otherwise, make it depend on use_mixed_precision flag.
    precision = 32 if num_gpus == 0 else 16 if container.use_mixed_precision else 32
    # The next two flags control the settings in torch.backends.cudnn.deterministic and torch.backends.cudnn.benchmark
    # https://pytorch.org/docs/stable/notes/randomness.html
    # Note that switching to deterministic models can have large performance downside.
    if container.pl_deterministic:
        deterministic = True
        benchmark = False
    else:
        deterministic = False
        benchmark = True

    # The last checkpoint is considered the "best" checkpoint. For large segmentation
    # models, this still appears to be the best way of choosing them because validation loss on the relatively small
    # training patches is not stable enough. Going by the validation loss somehow works for the Prostate model, but
    # not for the HeadAndNeck model.
    # Note that "last" is somehow a misnomer, it should rather be "latest". There is a "last" checkpoint written in
    # every epoch. We could use that for recovery too, but it could happen that the job gets preempted right during
    # writing that file, and we would end up with an invalid file.
    last_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
                                               save_last=True,
                                               save_top_k=0)
    recovery_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
                                                   filename=AUTOSAVE_CHECKPOINT_FILE_NAME,
                                                   every_n_val_epochs=container.autosave_every_n_val_epochs,
                                                   save_last=False)
    callbacks: List[Callback] = [
        last_checkpoint_callback,
        recovery_checkpoint_callback,
    ]
    if container.monitor_loading:
        # TODO antonsc: Remove after fixing the callback.
        raise NotImplementedError("Monitoring batch loading times has been temporarily disabled.")
        # callbacks.append(BatchTimeCallback())
    if num_gpus > 0 and container.monitor_gpu:
        logging.info("Adding monitoring for GPU utilization")
        callbacks.append(GPUStatsMonitor(intra_step_time=True, inter_step_time=True))
    # Add the additional callbacks that were specified in get_trainer_arguments for LightningContainers
    additional_args = container.get_trainer_arguments()
    # Callbacks can be specified via the "callbacks" argument (the legacy behaviour) or the new get_callbacks method
    if "callbacks" in additional_args:
        more_callbacks = additional_args.pop("callbacks")
        if isinstance(more_callbacks, list):
            callbacks.extend(more_callbacks)  # type: ignore
        else:
            callbacks.append(more_callbacks)  # type: ignore
    callbacks.extend(container.get_callbacks())
    is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
    progress_bar_refresh_rate = container.pl_progress_bar_refresh_rate
    if progress_bar_refresh_rate is None:
        progress_bar_refresh_rate = 50
        logging.info(f"The progress bar refresh rate is not set. Using a default of {progress_bar_refresh_rate}. "
                     f"To change, modify the pl_progress_bar_refresh_rate field of the container.")
    if is_azureml_run:
        callbacks.append(AzureMLProgressBar(refresh_rate=progress_bar_refresh_rate,
                                            write_to_logging_info=True,
                                            print_timestamp=False))
    else:
        callbacks.append(TQDMProgressBar(refresh_rate=progress_bar_refresh_rate))
    # Read out additional model-specific args here.
    # We probably want to keep essential ones like numgpu and logging.
    trainer = Trainer(default_root_dir=str(container.outputs_folder),
                      deterministic=deterministic,
                      benchmark=benchmark,
                      accelerator=accelerator,
                      strategy=strategy,
                      max_epochs=container.num_epochs,
                      # Both these arguments can be integers or floats. If integers, it is the number of batches.
                      # If float, it's the fraction of batches. We default to 1.0 (processing all batches).
                      limit_train_batches=container.pl_limit_train_batches or 1.0,
                      limit_val_batches=container.pl_limit_val_batches or 1.0,
                      num_sanity_val_steps=container.pl_num_sanity_val_steps,
                      check_val_every_n_epoch=container.pl_check_val_every_n_epoch,
                      callbacks=callbacks,
                      logger=loggers,
                      num_nodes=num_nodes,
                      devices=devices,
                      precision=precision,
                      sync_batchnorm=True,
                      detect_anomaly=container.detect_anomaly,
                      profiler=container.pl_profiler,
                      resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None,
                      multiple_trainloader_mode=multiple_trainloader_mode,
                      **additional_args)
    return trainer, storing_logger
Example #19
def create_lightning_trainer(container: LightningContainer,
                             resume_from_checkpoint: Optional[Path] = None,
                             num_nodes: int = 1,
                             **kwargs: Dict[str, Any]) -> \
        Tuple[Trainer, Optional[StoringLogger]]:
    """
    Creates a Pytorch Lightning Trainer object for the given model configuration. It creates checkpoint handlers
    and loggers. That includes a diagnostic logger for use in unit tests, which is also returned as the second
    return value.
    :param container: The container with model and data.
    :param resume_from_checkpoint: If provided, training resumes from this checkpoint.
    :param num_nodes: The number of nodes to use in distributed training.
    :param kwargs: Any additional keyword arguments will be passed to the constructor of Trainer.
    :return: A tuple [Trainer object, diagnostic logger]
    """
    # For now, stick with the legacy behaviour of always saving only the last epoch checkpoint. For large segmentation
    # models, this still appears to be the best way of choosing them because validation loss on the relatively small
    # training patches is not stable enough. Going by the validation loss somehow works for the Prostate model, but
    # not for the HeadAndNeck model.
    best_checkpoint_callback = ModelCheckpoint(
        dirpath=str(container.checkpoint_folder),
        # filename=BEST_CHECKPOINT_FILE_NAME,
        # monitor=f"{VALIDATION_PREFIX}{MetricType.LOSS.value}",
        # save_top_k=1,
        save_last=True)

    # Recovery checkpoints: {epoch} will turn into a string like "epoch=1"
    # Store 1 recovery checkpoint every recovery_checkpoint_save_interval epochs, keep the last
    # recovery_checkpoints_save_last_k.
    recovery_checkpoint_callback = InnerEyeRecoveryCheckpointCallback(
        container)

    num_gpus = container.num_gpus_per_node
    effective_num_gpus = num_gpus * num_nodes
    # Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of GPU memory).
    # For unit tests, only "ddp_spawn" works
    accelerator = "ddp" if effective_num_gpus > 1 else None
    if effective_num_gpus > 1:
        # Initialize the DDP plugin with find_unused_parameters=False by default. If True (default), it prints out
        # lengthy warnings about the performance impact of find_unused_parameters
        plugins = [
            InnerEyeDDPPlugin(
                num_nodes=num_nodes,
                sync_batchnorm=True,
                find_unused_parameters=container.pl_find_unused_parameters)
        ]
    else:
        plugins = []
    logging.info(
        f"Using {num_gpus} GPUs per node with accelerator '{accelerator}'")
    tensorboard_logger = TensorBoardLogger(save_dir=str(container.logs_folder),
                                           name="Lightning",
                                           version="")
    loggers = [tensorboard_logger, AzureMLLogger()]
    storing_logger: Optional[StoringLogger]
    if isinstance(container, InnerEyeContainer):
        storing_logger = StoringLogger()
        loggers.append(storing_logger)
    else:
        storing_logger = None
    # Use 32bit precision when running on CPU. Otherwise, make it depend on use_mixed_precision flag.
    precision = 32 if num_gpus == 0 else 16 if container.use_mixed_precision else 32
    # The next two flags control the settings in torch.backends.cudnn.deterministic and torch.backends.cudnn.benchmark
    # https://pytorch.org/docs/stable/notes/randomness.html
    # For the classification models, we observed only a small performance deterioration (an increase of about 10
    # seconds on a total training time of 22 minutes) when switching to deterministic.
    if container.pl_deterministic:
        deterministic = True
        benchmark = False
    else:
        deterministic = False
        benchmark = True
    # If the user provides additional callbacks via get_trainer_arguments (for custom containers), add them to the
    # default set of callbacks.
    callbacks = [best_checkpoint_callback, recovery_checkpoint_callback]
    if "callbacks" in kwargs:
        callbacks.append(kwargs.pop("callbacks"))  # type: ignore
    is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
    progress_bar_refresh_rate = container.pl_progress_bar_refresh_rate
    if progress_bar_refresh_rate is None and is_azureml_run:
        # When running in AzureML, the default progress bar clutters the output files with thousands of lines.
        progress_bar_refresh_rate = 50
        logging.info(
            f"The progress bar refresh rate is not set. Using a default of {progress_bar_refresh_rate}. "
            f"To change, modify the pl_progress_bar_refresh_rate field of the container."
        )
    # Read out additional model-specific args here.
    # We probably want to keep essential ones like numgpu and logging.
    trainer = Trainer(default_root_dir=str(container.outputs_folder),
                      deterministic=deterministic,
                      benchmark=benchmark,
                      accelerator=accelerator,
                      max_epochs=container.num_epochs,
                      num_sanity_val_steps=container.pl_num_sanity_val_steps,
                      callbacks=callbacks,
                      logger=loggers,
                      progress_bar_refresh_rate=progress_bar_refresh_rate,
                      num_nodes=num_nodes,
                      gpus=num_gpus,
                      precision=precision,
                      sync_batchnorm=True,
                      terminate_on_nan=container.detect_anomaly,
                      resume_from_checkpoint=str(resume_from_checkpoint)
                      if resume_from_checkpoint else None,
                      plugins=plugins,
                      **kwargs)
    return trainer, storing_logger
Example #20
 def is_offline_run(self) -> bool:
     """
     Returns True if the run is executing outside AzureML, or False if inside AzureML.
     """
     return is_offline_run_context(RUN_CONTEXT)
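
Every example in this listing hinges on is_offline_run_context, which is itself not shown. A minimal sketch of such a check, relying on Run.get_context() returning an offline placeholder object without an attached experiment when called outside a submitted AzureML run:

from azureml.core import Run


def is_offline_run_context(run_context: Run) -> bool:
    """Returns True if the given run context was created outside a submitted AzureML run."""
    # An offline placeholder run has no experiment attached to it; a submitted run always does.
    return not hasattr(run_context, 'experiment')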
Example #21
def model_train(checkpoint_handler: CheckpointHandler,
                container: LightningContainer,
                num_nodes: int = 1) -> Tuple[Trainer, Optional[StoringLogger]]:
    """
    The main training loop. It creates the Pytorch model based on the configuration options passed in,
    creates a Pytorch Lightning trainer, and trains the model.
    If a checkpoint was specified, then it loads the checkpoint before resuming training.
    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
    :param num_nodes: The number of nodes to use in distributed training.
    :param container: A container object that holds the training data in PyTorch Lightning format
    and the model to train.
    :return: A tuple of [Trainer, StoringLogger]. Trainer is the Lightning Trainer object that was used for fitting
    the model. The StoringLogger object is returned when training an InnerEye built-in model, this is None when
    fitting other models.
    """
    # Get the path to the checkpoint to recover from
    checkpoint_path = checkpoint_handler.get_recovery_path_train()
    lightning_model = container.model

    resource_monitor: Optional[ResourceMonitor] = None
    # Execute some bookkeeping tasks only once if running distributed:
    if is_global_rank_zero():
        logging.info(
            f"Model checkpoints are saved at {container.checkpoint_folder}")
        write_args_file(container.config if isinstance(
            container, InnerEyeContainer) else container,
                        outputs_folder=container.outputs_folder)
        if container.monitoring_interval_seconds > 0:
            resource_monitor = start_resource_monitor(container)

    # Run all of the container-related operations consistently with changed outputs folder, even ones that
    # should not rely on the current working directory, like get_data_module.
    with change_working_directory(container.outputs_folder):
        data_module = container.get_data_module()
        if is_global_rank_zero():
            container.before_training_on_global_rank_zero()
        if is_local_rank_zero():
            container.before_training_on_local_rank_zero()
        container.before_training_on_all_ranks()

    # Create the trainer object. Backup the environment variables before doing that, in case we need to run a second
    # training in the unit tests.
    old_environ = dict(os.environ)
    # Set random seeds just before training. For segmentation models, we have
    # something that changes the random seed in the before_training_on_rank_zero hook.
    seed_everything(container.get_effective_random_seed())
    trainer, storing_logger = create_lightning_trainer(
        container,
        checkpoint_path,
        num_nodes=num_nodes,
        **container.get_trainer_arguments())
    rank_info = ", ".join(
        f"{env}: {os.getenv(env)}"
        for env in [ENV_GLOBAL_RANK, ENV_LOCAL_RANK, ENV_NODE_RANK])
    logging.info(
        f"Environment variables: {rank_info}. trainer.global_rank: {trainer.global_rank}"
    )
    # InnerEye models use this logger for diagnostics
    if isinstance(lightning_model, InnerEyeLightning):
        if storing_logger is None:
            raise ValueError(
                "InnerEye models require the storing_logger for diagnostics")
        lightning_model.storing_logger = storing_logger

    logging.info("Starting training")
    # When training models that are not built-in InnerEye models, we have no guarantee that they write
    # files to the right folder. Best guess is to change the current working directory to where files should go.
    with change_working_directory(container.outputs_folder):
        trainer.fit(lightning_model, datamodule=data_module)
        trainer.logger.close()  # type: ignore
    world_size = getattr(trainer, "world_size", 0)
    is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
    # Per-subject model outputs for regression models are written per rank, and need to be aggregated here.
    # Each thread per rank will come here, and upload its files to the run outputs. Rank 0 will later download them.
    if is_azureml_run and world_size > 1 and isinstance(
            lightning_model, ScalarLightning):
        upload_output_file_as_temp(
            lightning_model.train_subject_outputs_logger.csv_path,
            container.outputs_folder)
        upload_output_file_as_temp(
            lightning_model.val_subject_outputs_logger.csv_path,
            container.outputs_folder)
    # DDP will start multiple instances of the runner, one for each GPU. Those should terminate here after training.
    # We can now use the global_rank of the Lightning model, rather than environment variables, because DDP has set
    # all necessary properties.
    if lightning_model.global_rank != 0:
        logging.info(
            f"Terminating training thread with rank {lightning_model.global_rank}."
        )
        sys.exit()

    logging.info("Choosing the best checkpoint and removing redundant files.")
    create_best_checkpoint(container.checkpoint_folder)
    # Lightning modifies a ton of environment variables. If we first run training and then the test suite,
    # those environment variables will mislead the training runs in the test suite, and make them crash.
    # Hence, restore the original environment after training.
    os.environ.clear()
    os.environ.update(old_environ)

    if world_size and isinstance(lightning_model, ScalarLightning):
        if is_azureml_run and world_size > 1:
            # In a DDP run on the local box, all ranks will write to local disk, hence no download needed.
            # In a multi-node DDP, each rank would upload to AzureML, and rank 0 will now download all results and
            # concatenate
            for rank in range(world_size):
                for mode in [ModelExecutionMode.TRAIN, ModelExecutionMode.VAL]:
                    file = mode.value + "/" + get_subject_output_file_per_rank(
                        rank)
                    RUN_CONTEXT.download_file(
                        name=TEMP_PREFIX + file,
                        output_file_path=container.outputs_folder / file)
        # Concatenate all temporary file per execution mode
        aggregate_and_create_subject_metrics_file(container.outputs_folder)

    logging.info("Finished training")

    # Since we have trained the model further, let the checkpoint_handler object know so it can handle
    # checkpoints correctly.
    checkpoint_handler.additional_training_done()

    # Upload visualization directory to AML run context to be able to see it in the Azure UI.
    if isinstance(container, InnerEyeContainer):
        if container.config.max_batch_grad_cam > 0 and container.visualization_folder.exists(
        ):
            RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER,
                                      path=str(container.visualization_folder))

    if resource_monitor:
        logging.info("Shutting down the resource monitor process.")
        if is_azureml_run:
            for gpu_name, metrics_per_gpu in resource_monitor.read_aggregate_metrics(
            ).items():
                # Log as a table, with GPU being the first column
                RUN_CONTEXT.log_row("GPU utilization",
                                    GPU=gpu_name,
                                    **metrics_per_gpu)
        resource_monitor.kill()

    return trainer, storing_logger
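
Example #21 wraps several steps in a change_working_directory context manager. A sketch of an equivalent helper, assuming it simply switches the process working directory and restores the original one afterwards:

import os
from contextlib import contextmanager
from pathlib import Path
from typing import Generator


@contextmanager
def change_working_directory(folder: Path) -> Generator[None, None, None]:
    """Temporarily switches the working directory, restoring the previous one even if the body raises."""
    old_cwd = os.getcwd()
    os.chdir(str(folder))
    try:
        yield
    finally:
        os.chdir(old_cwd)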