def set_multiprocessing_start_method(self) -> None:
    """
    Set the (PyTorch) multiprocessing start method.
    """
    method = self.model_config.multiprocessing_start_method
    if is_windows():
        if method != MultiprocessingStartMethod.spawn:
            logging.warning(f"Cannot set multiprocessing start method to '{method.name}' "
                            "because only 'spawn' is available in Windows")
    else:
        logging.info(f"Setting multiprocessing start method to '{method.name}'")
        torch.multiprocessing.set_start_method(method.name, force=True)
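A minimal standalone sketch of the same guard, assuming only torch and the standard platform module (the MultiprocessingStartMethod enum and the is_windows helper used above are InnerEye-specific and not needed here):

import logging
import platform

import torch


def set_start_method_safely(method: str = "forkserver") -> None:
    # Windows only supports 'spawn'; warn instead of failing when anything else is requested.
    if platform.system() == "Windows" and method != "spawn":
        logging.warning("Cannot set multiprocessing start method to '%s' because only "
                        "'spawn' is available on Windows", method)
        return
    # force=True overrides a start method that may already have been set earlier in the process.
    torch.multiprocessing.set_start_method(method, force=True)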
Example #2
                              combine_hidden_states=combine_hidden_state,
                              use_encoder_layer_norm=use_encoder_layer_norm,
                              use_mean_teacher_model=use_mean_teacher_model,
                              should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.dataset_data_frame = _get_mock_sequence_dataset()
    # Patch the load_images function that will be called once we access a dataset item
    image_and_seg = ImageAndSegmentations[np.ndarray](images=np.random.uniform(0, 1, SCAN_SIZE),
                                                      segmentations=np.random.randint(0, 2, SCAN_SIZE))
    with mock.patch('InnerEye.ML.utils.io_util.load_image_in_known_formats', return_value=image_and_seg):
        results = model_train(config)
        assert len(results.optimal_temperature_scale_values_per_checkpoint_epoch) \
               == config.get_total_number_of_save_epochs()
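The mock.patch pattern above, which stubs out image loading so that dataset items can be built without touching the file system, can be reproduced with just numpy and the standard library; load_scan and mean_intensity below are hypothetical stand-ins for the patched InnerEye functions:

from unittest import mock

import numpy as np


def load_scan(path: str) -> np.ndarray:
    # Hypothetical I/O function; in the test above the patched target is
    # InnerEye.ML.utils.io_util.load_image_in_known_formats.
    raise RuntimeError("should not be called from unit tests")


def mean_intensity(path: str) -> float:
    return float(load_scan(path).mean())


def test_mean_intensity_with_mocked_io() -> None:
    fake_image = np.random.uniform(0, 1, (4, 5, 6))
    # Patch by import path, as the test above does, so that no file is read from disk.
    with mock.patch(__name__ + ".load_scan", return_value=fake_image):
        assert mean_intensity("does_not_exist.nii.gz") == fake_image.mean()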


@pytest.mark.skipif(common_util.is_windows(), reason="Has issues on windows build")
@pytest.mark.parametrize(["use_combined_model", "imaging_feature_type"],
                         [(False, ImagingFeatureType.Image),
                          (True, ImagingFeatureType.Image),
                          (True, ImagingFeatureType.Segmentation),
                          (True, ImagingFeatureType.ImageAndSegmentation)])
def test_run_ml_with_sequence_model(use_combined_model: bool,
                                    imaging_feature_type: ImagingFeatureType,
                                    test_output_dirs: TestOutputDirectories) -> None:
    """
    Test training and testing of sequence models when they are started together via run_ml.
    """
    logging_to_stdout()
    config = ToySequenceModel(use_combined_model, imaging_feature_type,
                              should_validate=False, sequence_target_positions=[2, 10])
    config.set_output_to(test_output_dirs.root_dir)
def test_train_classification_model(
        class_name: str, test_output_dirs: OutputFolderForTests) -> None:
    """
    Test training and testing of classification models, asserting on the individual results from training and
    testing.
    Expected test results are stored for GPU with and without mixed precision.
    """
    logging_to_stdout(logging.DEBUG)
    config = ClassificationModelForTesting()
    config.class_names = [class_name]
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(
        model_config=config, project_root=Path(test_output_dirs.root_dir))
    # Train for 4 epochs, checkpoints at epochs 2 and 4
    config.num_epochs = 4
    model_training_result = model_training.model_train(
        config, checkpoint_handler=checkpoint_handler)
    assert model_training_result is not None
    expected_learning_rates = [0.0001, 9.99971e-05, 9.99930e-05, 9.99861e-05]
    expected_train_loss = [0.686614, 0.686465, 0.686316, 0.686167]
    expected_val_loss = [0.737061, 0.736691, 0.736321, 0.735952]
    # Ensure that all metrics are computed on both training and validation set
    assert len(
        model_training_result.train_results_per_epoch) == config.num_epochs
    assert len(
        model_training_result.val_results_per_epoch) == config.num_epochs
    assert len(model_training_result.train_results_per_epoch[0]) >= 11
    assert len(model_training_result.val_results_per_epoch[0]) >= 11

    for metric in [
            MetricType.ACCURACY_AT_THRESHOLD_05,
            MetricType.ACCURACY_AT_OPTIMAL_THRESHOLD,
            MetricType.AREA_UNDER_PR_CURVE, MetricType.AREA_UNDER_ROC_CURVE,
            MetricType.CROSS_ENTROPY, MetricType.LOSS,
            MetricType.SECONDS_PER_BATCH, MetricType.SECONDS_PER_EPOCH,
            MetricType.SUBJECT_COUNT
    ]:
        assert metric.value in model_training_result.train_results_per_epoch[0], \
            f"{metric.value} not in training"
        assert metric.value in model_training_result.val_results_per_epoch[0], \
            f"{metric.value} not in validation"

    actual_train_loss = model_training_result.get_metric(
        is_training=True, metric_type=MetricType.LOSS.value)
    actual_val_loss = model_training_result.get_metric(
        is_training=False, metric_type=MetricType.LOSS.value)
    actual_lr = model_training_result.get_metric(
        is_training=True, metric_type=MetricType.LEARNING_RATE.value)
    assert actual_train_loss == pytest.approx(expected_train_loss,
                                              abs=1e-6), "Training loss"
    assert actual_val_loss == pytest.approx(expected_val_loss,
                                            abs=1e-6), "Validation loss"
    assert actual_lr == pytest.approx(expected_learning_rates,
                                      rel=1e-5), "Learning rates"
    test_results = model_testing.model_test(
        config,
        ModelExecutionMode.TRAIN,
        checkpoint_handler=checkpoint_handler)
    assert isinstance(test_results, InferenceMetricsForClassification)
    expected_metrics = [0.636085, 0.735952]
    assert test_results.metrics.values(class_name)[MetricType.CROSS_ENTROPY.value] == \
           pytest.approx(expected_metrics, abs=1e-5)
    # Run the detailed log file checks only on CPU: the files contain slightly different metrics on GPU. Here
    # we mainly want to assert that the files look reasonable.
    if machine_has_gpu:
        return

    # Check epoch_metrics.csv
    epoch_metrics_path = config.outputs_folder / ModelExecutionMode.TRAIN.value / EPOCH_METRICS_FILE_NAME
    # Auto-format will break the long header line, hence the strange way of writing it!
    expected_epoch_metrics = \
        f"{LoggingColumns.Loss.value},{LoggingColumns.CrossEntropy.value}," \
        f"{LoggingColumns.AccuracyAtThreshold05.value},{LoggingColumns.LearningRate.value}," + \
        f"{LoggingColumns.AreaUnderRocCurve.value}," \
        f"{LoggingColumns.AreaUnderPRCurve.value}," \
        f"{LoggingColumns.AccuracyAtOptimalThreshold.value}," \
        f"{LoggingColumns.FalsePositiveRateAtOptimalThreshold.value}," \
        f"{LoggingColumns.FalseNegativeRateAtOptimalThreshold.value}," \
        f"{LoggingColumns.OptimalThreshold.value}," \
        f"{LoggingColumns.SubjectCount.value},{LoggingColumns.Epoch.value}," \
        f"{LoggingColumns.CrossValidationSplitIndex.value}\n" + \
        """0.6866141557693481,0.6866141557693481,0.5,0.0001,1.0,1.0,0.5,0.0,0.0,0.529514,2.0,0,-1	
        0.6864652633666992,0.6864652633666992,0.5,9.999712322065557e-05,1.0,1.0,0.5,0.0,0.0,0.529475,2.0,1,-1	
        0.6863163113594055,0.6863162517547607,0.5,9.999306876841536e-05,1.0,1.0,0.5,0.0,0.0,0.529437,2.0,2,-1	
        0.6861673593521118,0.6861673593521118,0.5,9.998613801725043e-05,1.0,1.0,0.5,0.0,0.0,0.529399,2.0,3,-1	
        """
    check_log_file(epoch_metrics_path,
                   expected_epoch_metrics,
                   ignore_columns=[])
    # Check metrics.csv: This contains the per-subject per-epoch model outputs
    # Randomization comes out slightly different on Windows, hence only execute the test on Linux
    if common_util.is_windows():
        return
    metrics_path = config.outputs_folder / ModelExecutionMode.TRAIN.value / SUBJECT_METRICS_FILE_NAME
    metrics_expected = \
        f"""epoch,subject,prediction_target,model_output,label,data_split,cross_validation_split_index
0,S2,{class_name},0.529514,1,Train,-1
0,S4,{class_name},0.521659,0,Train,-1
1,S4,{class_name},0.521482,0,Train,-1
1,S2,{class_name},0.529475,1,Train,-1
2,S4,{class_name},0.521305,0,Train,-1
2,S2,{class_name},0.529437,1,Train,-1
3,S2,{class_name},0.529399,1,Train,-1
3,S4,{class_name},0.521128,0,Train,-1
"""
    check_log_file(metrics_path, metrics_expected, ignore_columns=[])
    # Check the log file METRICS_FILE_NAME inside the folder epoch_004/Train, which is written when we run model_test.
    # Normally, we would run it on the Test and Val splits, but for convenience we test on the train split here.
    inference_metrics_path = config.outputs_folder / get_epoch_results_path(ModelExecutionMode.TRAIN) / \
                             SUBJECT_METRICS_FILE_NAME
    inference_metrics_expected = \
        f"""prediction_target,subject,model_output,label,cross_validation_split_index,data_split
{class_name},S2,0.5293986201286316,1.0,-1,Train
{class_name},S4,0.5211275815963745,0.0,-1,Train
"""
    check_log_file(inference_metrics_path,
                   inference_metrics_expected,
                   ignore_columns=[])
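check_log_file is an InnerEye test helper; its exact implementation is not shown here, but a rough pandas-based sketch of a CSV comparison that tolerates machine-dependent columns (names and behaviour below are assumptions, not the real helper) could look like this:

from io import StringIO
from pathlib import Path
from typing import List

import pandas as pd


def compare_csv_ignoring_columns(actual_file: Path, expected_csv: str,
                                 ignore_columns: List[str]) -> None:
    # The expected CSV is written as an indented triple-quoted string above, so strip each line first.
    expected_csv = "\n".join(line.strip() for line in expected_csv.splitlines() if line.strip())
    actual = pd.read_csv(actual_file)
    expected = pd.read_csv(StringIO(expected_csv))
    for column in ignore_columns:
        # Ignored columns (for example timing columns) must still be present, but their values are not compared.
        assert column in actual.columns, f"Column '{column}' missing from {actual_file}"
        actual = actual.drop(columns=column)
        expected = expected.drop(columns=column)
    pd.testing.assert_frame_equal(actual, expected, check_like=True)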
Example #4

def compare_files(actual: List[Path], expected: List[str]) -> None:
    assert len(actual) == len(expected)
    for (f, e) in zip(actual, expected):
        assert f.exists()
        full_expected = full_ml_test_data_path(e)
        assert full_expected.exists()
        assert str(f).endswith(e)
        # To update the stored results, uncomment this line:
        # full_expected.write_bytes(f.read_bytes())
        assert file_as_bytes(f) == file_as_bytes(full_expected)


@pytest.mark.skipif(
    common_util.is_windows(),
    reason="Rendering of the graph is slightly different on Linux")
def test_plot_normalization_result(
        test_output_dirs: TestOutputDirectories) -> None:
    """
    Tests plotting of before/after histograms in photometric normalization.
    :return:
    """
    size = (3, 3, 3)
    image = np.zeros((1, ) + size)
    for i, (z, y, x) in enumerate(
            itertools.product(range(size[0]), range(size[1]), range(size[2]))):
        image[0, z, y, x] = i
    labels = np.zeros((2, ) + size)
    labels[1, 1, 1, 1] = 1
    sample = Sample(image=image,
import pandas as pd
import pytest

from InnerEye.Common.common_util import is_windows
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.dataset.scalar_dataset import ScalarDataset
from InnerEye.ML.dataset.scalar_sample import ScalarItem
from InnerEye.ML.scalar_config import ScalarModelBase
from InnerEye.ML.utils import ml_util


@pytest.mark.parametrize("num_dataload_workers", [0, 1])
@pytest.mark.parametrize("shuffle", [False, True])
@pytest.mark.skipif(
    is_windows(),
    reason=
    "This test runs fine on local Windows boxes, but leads to odd timeouts in Azure"
)
def test_dataloader_speed(test_output_dirs: OutputFolderForTests,
                          num_dataload_workers: int, shuffle: bool) -> None:
    """
    Test how dataloaders work when using multiple processes.
    """
    ml_util.set_random_seed(0)
    # The dataset should only contain the file name stem, without extension.
    csv_string = StringIO("""subject,channel,path,value,scalar1
S1,image,4be9beed-5861-fdd2-72c2-8dd89aadc1ef
S1,label,,True,1.0
S2,image,6ceacaf8-abd2-ffec-2ade-d52afd6dd1be
S2,label,,True,2.0
Example #6
        expected_files = [config.checkpoint_folder / run_to_recover.id / expected_checkpoint_file]

    checkpoint_paths = run_recovery.get_checkpoint_paths(1)
    if is_ensemble:
        assert len(run_recovery.checkpoints_roots) == len(expected_files)
        assert all([(x in [y.parent for y in expected_files]) for x in run_recovery.checkpoints_roots])
        assert len(checkpoint_paths) == len(expected_files)
        assert all([x in expected_files for x in checkpoint_paths])
    else:
        assert len(checkpoint_paths) == 1
        assert checkpoint_paths[0] == expected_files[0]

    assert all([expected_file.exists() for expected_file in expected_files])


@pytest.mark.skipif(common_util.is_windows(), reason="Has issues on the windows build")
def test_download_checkpoints_hyperdrive_run(test_output_dirs: OutputFolderForTests,
                                             runner_config: AzureConfig) -> None:
    output_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(output_dir)
    runner_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    child_runs = fetch_child_runs(run=fetch_run(runner_config.get_workspace(), DEFAULT_ENSEMBLE_RUN_RECOVERY_ID))
    # Also recover the child runs separately, to test the hyperdrive child run recovery functionality
    expected_checkpoint_file = "1" + CHECKPOINT_FILE_SUFFIX
    for child in child_runs:
        expected_files = [config.checkpoint_folder / child.id / expected_checkpoint_file]
        run_recovery = RunRecovery.download_checkpoints_from_recovery_run(runner_config, config, child)
        assert all([x in expected_files for x in run_recovery.get_checkpoint_paths(epoch=1)])
        assert all([expected_file.exists() for expected_file in expected_files])
Example #7
class WorkflowParams(param.Parameterized):
    """
    This class contains all parameters that affect how the whole training and testing workflow is executed.
    """
    random_seed: int = param.Integer(
        42, doc="The seed to use for all random number generators.")
    number_of_cross_validation_splits: int = param.Integer(
        0,
        bounds=(0, None),
        doc="Number of cross validation splits for k-fold cross "
        "validation")
    cross_validation_split_index: int = param.Integer(
        DEFAULT_CROSS_VALIDATION_SPLIT_INDEX,
        bounds=(-1, None),
        doc="The index of the cross validation fold this model is "
        "associated with when performing k-fold cross validation")
    perform_training_set_inference: bool = \
        param.Boolean(False,
                      doc="If True, run full image inference on the training set at the end of training. If False and "
                          "perform_validation_and_test_set_inference is True (default), only run inference on "
                          "validation and test set. If both flags are False do not run inference.")
    perform_validation_and_test_set_inference: bool = \
        param.Boolean(True,
                      doc="If True (default), run full image inference on validation and test set after training.")
    weights_url: str = param.String(
        doc=
        "If provided, a url from which weights will be downloaded and used for model "
        "initialization.")
    local_weights_path: Optional[Path] = param.ClassSelector(
        class_=Path,
        default=None,
        allow_None=True,
        doc="The path to the weights to use for model "
        "initialization, when training outside AzureML.")
    generate_report: bool = param.Boolean(
        default=True,
        doc=
        "If True (default), write a modelling report in HTML format. If False,"
        "do not write that report.")
    # The default multiprocessing start_method in both PyTorch and the Python standard library is "fork" for Linux and
    # "spawn" (the only available method) for Windows. There is some evidence that using "forkserver" on Linux
    # can reduce the chance of stuck jobs.
    multiprocessing_start_method: MultiprocessingStartMethod = \
        param.ClassSelector(class_=MultiprocessingStartMethod,
                            default=(MultiprocessingStartMethod.spawn if is_windows()
                                     else MultiprocessingStartMethod.fork),
                            doc="Method to be used to start child processes in pytorch. Should be one of forkserver, "
                                "fork or spawn. If not specified, fork is used on Linux and spawn on Windows. "
                                "Set to forkserver as a possible remedy for stuck jobs.")
    monitoring_interval_seconds: int = param.Integer(
        0,
        doc="Seconds delay between logging GPU/CPU resource "
        "statistics. If 0 or less, do not log any resource "
        "statistics.")

    def validate(self) -> None:
        if self.weights_url and self.local_weights_path:
            raise ValueError(
                "Cannot specify both local_weights_path and weights_url.")

        if self.number_of_cross_validation_splits == 1:
            raise ValueError(
                "At least two splits required to perform cross validation, but got "
                f"{self.number_of_cross_validation_splits}. To train without cross validation, set "
                "number_of_cross_validation_splits=0.")
        if 0 < self.number_of_cross_validation_splits <= self.cross_validation_split_index:
            raise ValueError(
                f"Cross validation split index is out of bounds: {self.cross_validation_split_index}, "
                f"which is invalid for CV with {self.number_of_cross_validation_splits} splits."
            )
        elif self.number_of_cross_validation_splits == 0 and self.cross_validation_split_index != -1:
            raise ValueError(
                f"Cross validation split index must be -1 for a non cross validation run, "
                f"found number_of_cross_validation_splits = {self.number_of_cross_validation_splits} "
                f"and cross_validation_split_index={self.cross_validation_split_index}"
            )
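    # Illustration of the validation rules above (added for clarity, not part of the original class):
    # number_of_cross_validation_splits=5 requires cross_validation_split_index in 0..4;
    # number_of_cross_validation_splits=0 (no cross validation) requires the index to stay at -1;
    # number_of_cross_validation_splits=1 is always rejected, since a single split is not cross validation.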

    @property
    def is_offline_run(self) -> bool:
        """
        Returns True if the run is executing outside AzureML, or False if inside AzureML.
        """
        return is_offline_run_context(RUN_CONTEXT)

    @property
    def perform_cross_validation(self) -> bool:
        """
        True if cross validation will be performed as part of the training procedure.
        :return:
        """
        return self.number_of_cross_validation_splits > 1

    def get_effective_random_seed(self) -> int:
        """
        Returns the random seed set as part of this configuration. If the configuration corresponds
        to a cross validation split, then the cross validation fold index will be added to the
        set random seed in order to return the effective random seed.
        :return:
        """
        seed = self.random_seed
        if self.perform_cross_validation:
            # offset the random seed based on the cross validation split index so each
            # fold has a different initial random state.
            seed += self.cross_validation_split_index
        return seed
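For illustration, the seed offsetting in get_effective_random_seed gives every cross validation fold its own deterministic seed; a small standalone sketch of the same idea, not tied to WorkflowParams:

def effective_random_seed(random_seed: int, number_of_splits: int, split_index: int) -> int:
    # Mirrors get_effective_random_seed above: offset the base seed by the fold index so that
    # each cross validation fold starts from a different random state.
    return random_seed + split_index if number_of_splits > 1 else random_seed


# With the default random_seed of 42 and a 5-fold cross validation run, the folds use seeds 42..46.
assert [effective_random_seed(42, 5, i) for i in range(5)] == [42, 43, 44, 45, 46]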
Example #8
        use_mixed_precision=True)
    lightning_model = create_lightning_model(config)
    assert isinstance(lightning_model, SegmentationLightning)
    pipeline = InferencePipeline(model=lightning_model, model_config=config)
    image = np.random.uniform(-1, 1, (1, ) + size)
    result = pipeline.predict_and_post_process_whole_image(
        image, mask=np.ones(size), voxel_spacing_mm=(1, 1, 1))
    # All posteriors and segmentations must have the size of the input image
    for p in [*result.posteriors, result.segmentation]:
        assert p.shape == size
        # Check that all results are not NaN. In particular, if stride size is not adjusted
        # correctly, the results would be partially NaN.
        image_util.check_array_range(p)


@pytest.mark.skipif(is_windows(), reason="Too slow on windows")
def test_inference_on_too_small_image() -> None:
    """
    Running inference on a simplified Unet model when the input image is too small along an axis.
    """
    with pytest.raises(ValueError) as ex:
        run_inference_on_unet((5, 10, 64))
    assert "input image must have at least a size of (16, 16, 16)" in str(ex)


@pytest.mark.skipif(is_windows(), reason="Too slow on windows")
@pytest.mark.parametrize("size", [(26, 20, 50), (16, 16, 16)])
def test_inference_on_small_image(size: TupleInt3) -> None:
    """
    Test case for a failure at test time: Inference failed when
    the image was smaller than the test_crop_size. Try with different size, one that has
Example #9
def test_train_classification_model(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Test training and testing of classification models, asserting on the individual results from training and
    testing.
    Expected test results are stored for GPU with and without mixed precision.
    """
    logging_to_stdout(logging.DEBUG)
    config = ClassificationModelForTesting()
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(
        model_config=config, project_root=Path(test_output_dirs.root_dir))
    # Train for 4 epochs, checkpoints at epochs 2 and 4
    config.num_epochs = 4
    model_training_result = model_training.model_train(
        config, checkpoint_handler=checkpoint_handler)
    assert model_training_result is not None
    expected_learning_rates = [0.0001, 9.99971e-05, 9.99930e-05, 9.99861e-05]
    expected_train_loss = [0.686614, 0.686465, 0.686316, 0.686167]
    expected_val_loss = [0.737061, 0.736691, 0.736321, 0.735952]
    # Ensure that all metrics are computed on both training and validation set
    assert len(
        model_training_result.train_results_per_epoch) == config.num_epochs
    assert len(
        model_training_result.val_results_per_epoch) == config.num_epochs
    assert len(model_training_result.train_results_per_epoch[0]) >= 11
    assert len(model_training_result.val_results_per_epoch[0]) >= 11
    for metric in [
            MetricType.ACCURACY_AT_THRESHOLD_05,
            MetricType.ACCURACY_AT_OPTIMAL_THRESHOLD,
            MetricType.AREA_UNDER_PR_CURVE,
            MetricType.AREA_UNDER_ROC_CURVE,
            MetricType.CROSS_ENTROPY,
            MetricType.LOSS,
            # For unknown reasons, we don't get seconds_per_batch for the training data.
            # MetricType.SECONDS_PER_BATCH,
            MetricType.SECONDS_PER_EPOCH,
            MetricType.SUBJECT_COUNT,
    ]:
        assert metric.value in model_training_result.train_results_per_epoch[
            0], f"{metric.value} not in training"
        assert metric.value in model_training_result.val_results_per_epoch[
            0], f"{metric.value} not in validation"
    actual_train_loss = model_training_result.get_metric(
        is_training=True, metric_type=MetricType.LOSS.value)
    actual_val_loss = model_training_result.get_metric(
        is_training=False, metric_type=MetricType.LOSS.value)
    actual_lr = model_training_result.get_metric(
        is_training=True, metric_type=MetricType.LEARNING_RATE.value)
    assert actual_train_loss == pytest.approx(expected_train_loss,
                                              abs=1e-6), "Training loss"
    assert actual_val_loss == pytest.approx(expected_val_loss,
                                            abs=1e-6), "Validation loss"
    assert actual_lr == pytest.approx(expected_learning_rates,
                                      rel=1e-5), "Learning rates"
    test_results = model_testing.model_test(
        config,
        ModelExecutionMode.TRAIN,
        checkpoint_handler=checkpoint_handler)
    assert isinstance(test_results, InferenceMetricsForClassification)
    expected_metrics = [0.636085, 0.735952]
    assert test_results.metrics.values()[MetricType.CROSS_ENTROPY.value] == \
           pytest.approx(expected_metrics, abs=1e-5)
    # Run the detailed log file checks only on CPU: the files contain slightly different metrics on GPU. Here
    # we mainly want to assert that the files look reasonable.
    if machine_has_gpu:
        return
    # Check epoch_metrics.csv
    epoch_metrics_path = config.outputs_folder / ModelExecutionMode.TRAIN.value / EPOCH_METRICS_FILE_NAME
    # Auto-format will break the long header line, hence the strange way of writing it!
    expected_epoch_metrics = \
        "loss,cross_entropy,accuracy_at_threshold_05,seconds_per_epoch,learning_rate," + \
        "area_under_roc_curve,area_under_pr_curve,accuracy_at_optimal_threshold," \
        "false_positive_rate_at_optimal_threshold,false_negative_rate_at_optimal_threshold," \
        "optimal_threshold,subject_count,epoch,cross_validation_split_index\n" + \
        """0.6866141557693481,0.6866141557693481,0.5,0,0.0001,1.0,1.0,0.5,0.0,0.0,0.529514,2.0,0,-1	
        0.6864652633666992,0.6864652633666992,0.5,0,9.999712322065557e-05,1.0,1.0,0.5,0.0,0.0,0.529475,2.0,1,-1	
        0.6863163113594055,0.6863162517547607,0.5,0,9.999306876841536e-05,1.0,1.0,0.5,0.0,0.0,0.529437,2.0,2,-1	
        0.6861673593521118,0.6861673593521118,0.5,0,9.998613801725043e-05,1.0,1.0,0.5,0.0,0.0,0.529399,2.0,3,-1	
        """
    # We cannot compare columns like "seconds_per_epoch" because timing will obviously vary between machines.
    # Column must still be present, though.
    check_log_file(epoch_metrics_path,
                   expected_epoch_metrics,
                   ignore_columns=[LoggingColumns.SecondsPerEpoch.value])
    # Check metrics.csv: This contains the per-subject per-epoch model outputs
    # Randomization comes out slightly different on Windows, hence only execute the test on Linux
    if common_util.is_windows():
        return
    metrics_path = config.outputs_folder / ModelExecutionMode.TRAIN.value / SUBJECT_METRICS_FILE_NAME
    metrics_expected = \
        """prediction_target,epoch,subject,model_output,label,cross_validation_split_index,data_split
Default,0,S2,0.5295137763023376,1.0,-1,Train
Default,0,S4,0.5216594338417053,0.0,-1,Train
Default,1,S4,0.5214819312095642,0.0,-1,Train
Default,1,S2,0.5294750332832336,1.0,-1,Train
Default,2,S2,0.5294366478919983,1.0,-1,Train
Default,2,S4,0.5213046073913574,0.0,-1,Train
Default,3,S2,0.5293986201286316,1.0,-1,Train
Default,3,S4,0.5211275815963745,0.0,-1,Train
"""
    check_log_file(metrics_path, metrics_expected, ignore_columns=[])
    # Check the log file METRICS_FILE_NAME inside the folder epoch_004/Train, which is written when we run model_test.
    # Normally, we would run it on the Test and Val splits, but for convenience we test on the train split here.
    inference_metrics_path = config.outputs_folder / get_epoch_results_path(ModelExecutionMode.TRAIN) / \
                             SUBJECT_METRICS_FILE_NAME
    inference_metrics_expected = \
        """prediction_target,subject,model_output,label,cross_validation_split_index,data_split
Default,S2,0.5293986201286316,1.0,-1,Train
Default,S4,0.5211275815963745,0.0,-1,Train
"""
    check_log_file(inference_metrics_path,
                   inference_metrics_expected,
                   ignore_columns=[])
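The assertions above rely on pytest.approx accepting whole lists; a minimal self-contained illustration of the tolerance semantics used here (absolute tolerance for losses, relative tolerance for learning rates):

import pytest


def test_approx_tolerances() -> None:
    expected_losses = [0.686614, 0.686465]
    # abs=1e-6: every element may deviate by at most 1e-6 in absolute terms.
    assert [0.6866141, 0.6864649] == pytest.approx(expected_losses, abs=1e-6)
    expected_learning_rates = [0.0001, 9.99971e-05]
    # rel=1e-5: every element may deviate by at most 0.001% relative to the expected value.
    assert [0.0001000009, 9.99971e-05] == pytest.approx(expected_learning_rates, rel=1e-5)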
Example #10
class DeepLearningConfig(GenericConfig, CudaAwareConfig):
    """
    A class that holds all settings that are shared across segmentation models and regression/classification models.
    """
    _model_category: ModelCategory = param.ClassSelector(
        class_=ModelCategory,
        doc="The high-level model category described by this config.")
    _model_name: str = param.String(
        None,
        doc="The human readable name of the model (for example, Liver). This is "
        "usually set from the class name.")

    random_seed: int = param.Integer(
        42, doc="The seed to use for all random number generators.")
    azure_dataset_id: str = param.String(
        doc=
        "If provided, the ID of the dataset to use. This dataset must exist as a "
        "folder of the same name in the 'datasets' "
        "container in the datasets storage account.")
    local_dataset: Optional[Path] = param.ClassSelector(
        class_=Path,
        default=None,
        allow_None=True,
        doc="The path of the dataset to use, when training is running "
        "outside Azure.")
    num_dataload_workers: int = param.Integer(
        8,
        bounds=(0, None),
        doc="The number of data loading workers (processes). When set to 0,"
        "data loading is running in the same process (no process startup "
        "cost, hence good for use in unit testing. However, it "
        "does not give the same result as running with 1 worker process)")
    shuffle: bool = param.Boolean(
        True,
        doc="If true, the dataset will be shuffled randomly during training.")
    num_epochs: int = param.Integer(100,
                                    bounds=(1, None),
                                    doc="Number of epochs to train.")
    start_epoch: int = param.Integer(
        0,
        bounds=(0, None),
        doc="The first epoch to train. Set to 0 to start a new "
        "training. Set to a value larger than zero for starting"
        " from a checkpoint.")

    l_rate: float = param.Number(1e-4,
                                 doc="The initial learning rate",
                                 bounds=(0, None))
    _min_l_rate: float = param.Number(
        0.0,
        doc=
        "The minimum learning rate for the Polynomial and Cosine schedulers.",
        bounds=(0.0, None))
    l_rate_scheduler: LRSchedulerType = param.ClassSelector(
        default=LRSchedulerType.Polynomial,
        class_=LRSchedulerType,
        instantiate=False,
        doc="Learning rate decay method (Cosine, Polynomial, "
        "Step, MultiStep or Exponential)")
    l_rate_exponential_gamma: float = param.Number(
        0.9,
        doc="Controls the rate of decay for the Exponential "
        "LR scheduler.")
    l_rate_step_gamma: float = param.Number(
        0.1, doc="Controls the rate of decay for the "
        "Step LR scheduler.")
    l_rate_step_step_size: int = param.Integer(
        50, bounds=(0, None), doc="The step size for Step LR scheduler")
    l_rate_multi_step_gamma: float = param.Number(
        0.1,
        doc="Controls the rate of decay for the "
        "MultiStep LR scheduler.")
    l_rate_multi_step_milestones: Optional[List[int]] = param.List(
        None,
        bounds=(1, None),
        allow_None=True,
        class_=int,
        doc="The milestones for MultiStep decay.")
    l_rate_polynomial_gamma: float = param.Number(
        1e-4,
        doc="Controls the rate of decay for the "
        "Polynomial LR scheduler.")
    l_rate_warmup: LRWarmUpType = param.ClassSelector(
        default=LRWarmUpType.NoWarmUp,
        class_=LRWarmUpType,
        instantiate=False,
        doc="The type of learning rate warm up to use. "
        "Can be NoWarmUp (default) or Linear.")
    l_rate_warmup_epochs: int = param.Integer(
        0,
        bounds=(0, None),
        doc="Number of warmup epochs (linear warmup) before the "
        "scheduler starts decaying the learning rate. "
        "For example, if you are using MultiStepLR with "
        "milestones [50, 100, 200] and warmup epochs = 100, warmup "
        "will last for 100 epochs and the first decay of LR "
        "will happen on epoch 150")
    optimizer_type: OptimizerType = param.ClassSelector(
        default=OptimizerType.Adam,
        class_=OptimizerType,
        instantiate=False,
        doc="The optimizer_type to use")
    opt_eps: float = param.Number(
        1e-4, doc="The epsilon parameter of RMSprop or Adam")
    rms_alpha: float = param.Number(0.9, doc="The alpha parameter of RMSprop")
    adam_betas: TupleFloat2 = param.NumericTuple(
        (0.9, 0.999),
        length=2,
        doc="The betas parameter of Adam, default is (0.9, 0.999)")
    momentum: float = param.Number(
        0.6, doc="The momentum parameter of the optimizers")
    weight_decay: float = param.Number(
        1e-4, doc="The weight decay used to control L2 regularization")

    save_start_epoch: int = param.Integer(
        100,
        bounds=(0, None),
        doc="Save epoch checkpoints only when epoch is "
        "larger or equal to this value.")
    save_step_epochs: int = param.Integer(
        50,
        bounds=(0, None),
        doc="Save epoch checkpoints when epoch number is a "
        "multiple of save_step_epochs")
    train_batch_size: int = param.Integer(
        4,
        bounds=(0, None),
        doc="The number of crops that make up one minibatch during training.")
    detect_anomaly: bool = param.Boolean(
        False,
        doc="If true, test gradients for anomalies (NaN or Inf) during "
        "training.")
    use_mixed_precision: bool = param.Boolean(
        False,
        doc="If true, mixed precision training is activated during "
        "training.")
    use_model_parallel: bool = param.Boolean(
        False,
        doc="If true, neural network model is partitioned across all "
        "available GPUs to fit in a large model. It shall not be used "
        "together with data parallel.")
    test_diff_epochs: Optional[int] = param.Integer(
        None,
        doc="Number of different epochs of the same model to test",
        allow_None=True)
    test_step_epochs: Optional[int] = param.Integer(
        None, doc="How many epochs to move for each test", allow_None=True)
    test_start_epoch: Optional[int] = param.Integer(
        None,
        doc="The first epoch on which testing should run.",
        allow_None=True)
    monitoring_interval_seconds: int = param.Integer(
        0,
        doc="Seconds delay between logging GPU/CPU resource "
        "statistics. If 0 or less, do not log any resource "
        "statistics.")
    number_of_cross_validation_splits: int = param.Integer(
        0,
        bounds=(0, None),
        doc="Number of cross validation splits for k-fold cross "
        "validation")
    cross_validation_split_index: int = param.Integer(
        DEFAULT_CROSS_VALIDATION_SPLIT_INDEX,
        bounds=(-1, None),
        doc="The index of the cross validation fold this model is "
        "associated with when performing k-fold cross validation")
    file_system_config: DeepLearningFileSystemConfig = param.ClassSelector(
        default=DeepLearningFileSystemConfig(),
        class_=DeepLearningFileSystemConfig,
        instantiate=False,
        doc="File system related configs")
    pin_memory: bool = param.Boolean(
        True, doc="Value of pin_memory argument to DataLoader")
    _overrides: Dict[str, Any] = param.Dict(
        instantiate=True,
        doc="Model config properties that were overridden from the commandline"
    )
    restrict_subjects: Optional[str] = \
        param.String(doc="Use at most this number of subjects for train, val, or test set (must be > 0 or None). "
                         "If None, do not modify the train, val, or test sets. If a string of the form 'i,j,k' where "
                         "i, j and k are integers, modify just the corresponding sets (i for train, j for val, k for "
                         "test). If any of i, j or j are missing or are negative, do not modify the corresponding "
                         "set. Thus a value of 20,,5 means limit training set to 20, keep validation set as is, and "
                         "limit test set to 5. If any of i,j,k is '+', discarded members of the other sets are added "
                         "to that set.",
                     allow_None=True)
    perform_training_set_inference: bool = \
        param.Boolean(False,
                      doc="If False (default), run full image inference on validation and test set after training. If "
                          "True, also run full image inference on the training set")
    perform_validation_and_test_set_inference: bool = \
        param.Boolean(True,
                      doc="If True (default), run full image inference on validation and test set after training.")
    _metrics_data_frame_loggers: MetricsDataframeLoggers = param.ClassSelector(
        default=None,
        class_=MetricsDataframeLoggers,
        instantiate=False,
        doc="Data frame loggers for this model "
        "config")
    _dataset_data_frame: Optional[DataFrame] = \
        param.DataFrame(default=None,
                        doc="The dataframe that contains the dataset for the model. This is usually read from disk "
                            "from dataset.csv")
    _use_gpu: Optional[bool] = param.Boolean(
        None,
        doc="If true, a CUDA capable GPU with at least 1 device is "
        "available. If None, the use_gpu property has not yet been called.")
    avoid_process_spawn_in_data_loaders: bool = \
        param.Boolean(is_windows(), doc="If True, use a data loader logic that avoid spawning new processes at the "
                                        "start of each epoch. This speeds up training on both Windows and Linux, but"
                                        "on Linux, inference is currently disabled as the data loaders hang. "
                                        "If False, use the default data loader logic that starts new processes for "
                                        "each epoch.")
    # The default multiprocessing start_method in both PyTorch and the Python standard library is "fork" for Linux and
    # "spawn" (the only available method) for Windows. There is some evidence that using "forkserver" on Linux
    # can reduce the chance of stuck jobs.
    multiprocessing_start_method: MultiprocessingStartMethod = \
        param.ClassSelector(class_=MultiprocessingStartMethod,
                            default=(MultiprocessingStartMethod.spawn if is_windows()
                                     else MultiprocessingStartMethod.fork),
                            doc="Method to be used to start child processes in pytorch. Should be one of forkserver, "
                                "fork or spawn. If not specified, fork is used on Linux and spawn on Windows. "
                                "Set to forkserver as a possible remedy for stuck jobs.")
    output_to: Optional[str] = \
        param.String(default=None,
                     doc="If provided, the run outputs will be written to the given folder. If not provided, outputs "
                         "will go into a subfolder of the project root folder.")
    max_batch_grad_cam: int = param.Integer(
        default=0,
        doc="Max number of validation batches for which "
        "to save gradCam images. By default "
        "visualizations are saved for all images "
        "in the validation set")
    label_smoothing_eps: float = param.Number(
        0.0,
        bounds=(0.0, 1.0),
        doc="Target smoothing value for label smoothing")
    log_to_parent_run: bool = param.Boolean(
        default=False,
        doc="If true, hyperdrive child runs will log their metrics"
        "to their parent run.")

    use_imbalanced_sampler_for_training: bool = param.Boolean(
        default=False,
        doc="If True, use an imbalanced sampler during training.")
    drop_last_batch_in_training: bool = param.Boolean(
        default=False,
        doc="If True, drop the last incomplete batch during"
        "training. If all batches are complete, no batch gets "
        "dropped. If False, keep all batches.")
    log_summaries_to_files: bool = param.Boolean(
        default=True,
        doc=
        "If True, model summaries are logged to files in logs/model_summaries; "
        "if False, to stdout or driver log")
    mean_teacher_alpha: float = param.Number(
        bounds=(0, 1),
        allow_None=True,
        default=None,
        doc="If this value is set, the mean teacher model will be computed. "
        "Currently only supported for scalar models. In this case, we only "
        "report metrics and cross-validation results for "
        "the mean teacher model. Likewise the model used for inference "
        "is the mean teacher model. The student model is only used for "
        "training. Alpha is the momentum term for weight updates of the mean "
        "teacher model. After each training step the mean teacher model "
        "weights are updated using mean_teacher_"
        "weight = alpha * (mean_teacher_weight) "
        " + (1-alpha) * (current_student_weights). ")

    def __init__(self, **params: Any) -> None:
        self._model_name = type(self).__name__
        # This should be annotated as torch.utils.data.Dataset, but we don't want to import torch here.
        self._datasets_for_training: Optional[Dict[ModelExecutionMode,
                                                   Any]] = None
        self._datasets_for_inference: Optional[Dict[ModelExecutionMode,
                                                    Any]] = None
        super().__init__(throw_if_unknown_param=True, **params)
        logging.info("Creating the default output folder structure.")
        self.create_filesystem(fixed_paths.repository_root_directory())

    def validate(self) -> None:
        """
        Validates the parameters stored in the present object.
        """
        if len(self.adam_betas) < 2:
            raise ValueError(
                "The adam_betas parameter should be the coefficients used for computing running averages of "
                "gradient and its square")

        if self.azure_dataset_id is None and self.local_dataset is None:
            raise ValueError(
                "Either of local_dataset or azure_dataset_id must be set.")

        if self.number_of_cross_validation_splits == 1:
            raise ValueError(
                f"At least two splits required to perform cross validation found "
                f"number_of_cross_validation_splits={self.number_of_cross_validation_splits}"
            )
        if 0 < self.number_of_cross_validation_splits <= self.cross_validation_split_index:
            raise ValueError(
                f"Cross validation split index is out of bounds: {self.cross_validation_split_index}, "
                f"which is invalid for CV with {self.number_of_cross_validation_splits} splits."
            )
        elif self.number_of_cross_validation_splits == 0 and self.cross_validation_split_index != -1:
            raise ValueError(
                f"Cross validation split index must be -1 for a non cross validation run, "
                f"found number_of_cross_validation_splits = {self.number_of_cross_validation_splits} "
                f"and cross_validation_split_index={self.cross_validation_split_index}"
            )

        if self.l_rate_scheduler == LRSchedulerType.MultiStep:
            if not self.l_rate_multi_step_milestones:
                raise ValueError(
                    "Must specify l_rate_multi_step_milestones to use LR scheduler MultiStep"
                )
            if sorted(set(self.l_rate_multi_step_milestones)
                      ) != self.l_rate_multi_step_milestones:
                raise ValueError(
                    "l_rate_multi_step_milestones must be a strictly increasing list"
                )
            if self.l_rate_multi_step_milestones[0] <= 0:
                raise ValueError(
                    "l_rate_multi_step_milestones cannot be negative or 0.")

    @property
    def model_name(self) -> str:
        """
        Gets the human readable name of the model (e.g., Liver). This is usually set from the class name.
        :return: A model name as a string.
        """
        return self._model_name

    @property
    def model_category(self) -> ModelCategory:
        """
        Gets the high-level model category that this configuration object represents (segmentation or scalar output).
        """
        return self._model_category

    @property
    def is_segmentation_model(self) -> bool:
        """
        Returns True if the present model configuration belongs to the high-level category ModelCategory.Segmentation.
        """
        return self.model_category == ModelCategory.Segmentation

    @property
    def is_scalar_model(self) -> bool:
        """
        Returns True if the present model configuration belongs to the high-level category ModelCategory.Scalar
        i.e. for Classification or Regression models.
        """
        return self.model_category.is_scalar

    @property
    def compute_grad_cam(self) -> bool:
        return self.max_batch_grad_cam > 0

    @property
    def min_l_rate(self) -> float:
        return self._min_l_rate

    @min_l_rate.setter
    def min_l_rate(self, value: float) -> None:
        if value > self.l_rate:
            raise ValueError(
                "l_rate must be >= min_l_rate, found: {}, {}".format(
                    self.l_rate, value))
        self._min_l_rate = value

    @property
    def outputs_folder(self) -> Path:
        """Gets the full path in which the model outputs should be stored."""
        return self.file_system_config.outputs_folder

    @property
    def logs_folder(self) -> Path:
        """Gets the full path in which the model logs should be stored."""
        return self.file_system_config.logs_folder

    @property
    def checkpoint_folder(self) -> str:
        """Gets the full path in which the model checkpoints should be stored during training."""
        return str(self.outputs_folder / CHECKPOINT_FOLDER)

    @property
    def visualization_folder(self) -> Path:
        """Gets the full path in which the visualizations notebooks should be saved during training."""
        return self.outputs_folder / VISUALIZATION_FOLDER

    @property
    def perform_cross_validation(self) -> bool:
        """
        True if cross validation will be performed as part of the training procedure.
        :return:
        """
        return self.number_of_cross_validation_splits > 1

    @property
    def overrides(self) -> Optional[Dict[str, Any]]:
        return self._overrides

    @property
    def dataset_data_frame(self) -> Optional[DataFrame]:
        """
        Gets the pandas data frame that the model uses.
        :return:
        """
        return self._dataset_data_frame

    @dataset_data_frame.setter
    def dataset_data_frame(self, data_frame: Optional[DataFrame]) -> None:
        """
        Sets the pandas data frame that the model uses.
        :param data_frame: The data frame to set.
        """
        self._dataset_data_frame = data_frame

    @property
    def metrics_data_frame_loggers(self) -> MetricsDataframeLoggers:
        """
        Gets the metrics data frame loggers for this config.
        :return:
        """
        return self._metrics_data_frame_loggers

    def set_output_to(self, output_to: PathOrString) -> None:
        """
        Adjusts the file system settings in the present object such that all outputs are written to the given folder.
        :param output_to: The absolute path to a folder that should contain the outputs.
        """
        if isinstance(output_to, Path):
            output_to = str(output_to)
        self.output_to = output_to
        self.create_filesystem()

    def create_filesystem(
        self, project_root: Path = fixed_paths.repository_root_directory()
    ) -> None:
        """
        Creates new file system settings (outputs folder, logs folder) based on the information stored in the
        present object. If any of the folders do not yet exist, they are created.
        :param project_root: The root folder for the codebase that triggers the training run.
        """
        self.file_system_config = DeepLearningFileSystemConfig.create(
            project_root=project_root,
            model_name=self.model_name,
            is_offline_run=self.is_offline_run,
            output_to=self.output_to)

    def create_dataframe_loggers(self) -> None:
        """
        Initializes the metrics loggers that are stored in self._metrics_data_frame_loggers
        :return:
        """
        self._metrics_data_frame_loggers = MetricsDataframeLoggers(
            outputs_folder=self.outputs_folder)

    def should_load_checkpoint_for_training(self) -> bool:
        """Returns true if start epoch > 0, that is, if an existing checkpoint is used to continue training."""
        return self.start_epoch > 0

    def should_save_epoch(self, epoch: int) -> bool:
        """Returns True if the present epoch should be saved, as per the save_start_epoch and save_step_epochs
        settings. Epoch writing starts with the first epoch that is >= save_start_epoch, and that
        is evenly divisible by save_step_epochs. A checkpoint is always written for the last epoch (num_epochs),
        such that it is easy to overwrite num_epochs on the commandline without having to change the test parameters
        at the same time.
        :param epoch: The current epoch. The first epoch is assumed to be 1."""
        should_save_epoch = epoch >= self.save_start_epoch \
                            and epoch % self.save_step_epochs == 0
        is_last_epoch = epoch == self.num_epochs
        return should_save_epoch or is_last_epoch
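    # Worked example for should_save_epoch (added for illustration, not part of the original class):
    # with save_start_epoch=100, save_step_epochs=50 and num_epochs=120, checkpoints are written for
    # epoch 100 (>= 100 and divisible by 50) and for epoch 120 (the last epoch is always saved).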

    def get_train_epochs(self) -> List[int]:
        """
        Returns the epochs for which training will be performed.
        :return:
        """
        return list(range(self.start_epoch + 1, self.num_epochs + 1))

    def get_total_number_of_training_epochs(self) -> int:
        """
        Returns the number of epochs for which a model will be trained.
        :return:
        """
        return len(self.get_train_epochs())

    def get_total_number_of_save_epochs(self) -> int:
        """
        Returns the number of epochs for which a model checkpoint will be saved.
        :return:
        """
        return len(
            list(filter(self.should_save_epoch, self.get_train_epochs())))

    def get_total_number_of_validation_epochs(self) -> int:
        """
        Returns the number of epochs for which a model will be validated.
        :return:
        """
        return self.get_total_number_of_training_epochs()

    def get_test_epochs(self) -> List[int]:
        """
        Returns the list of epochs for which the model should be evaluated on full images in the test set.
        These are all epochs starting at self.test_start_epoch, in intervals of self.test_step_epochs.
        The last training epoch is always included. If either of the self.test_* fields is missing (set to None),
        only the last training epoch is returned.
        :return:
        """
        test_epochs = {self.num_epochs}
        if self.test_diff_epochs is not None and self.test_start_epoch is not None and \
                self.test_step_epochs is not None:
            for j in range(self.test_diff_epochs):
                epoch = self.test_start_epoch + self.test_step_epochs * j
                if epoch > self.num_epochs:
                    break
                test_epochs.add(epoch)
        return sorted(test_epochs)
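    # Worked example for get_test_epochs (added for illustration, not part of the original class):
    # with num_epochs=100, test_start_epoch=20, test_step_epochs=30 and test_diff_epochs=3, the
    # epochs tested on full images are [20, 50, 80, 100]; the final training epoch is always included.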

    def get_path_to_checkpoint(self, epoch: int) -> Path:
        """
        Returns the full path to a checkpoint, given an epoch.
        :param epoch: the epoch number
        :return: path to a checkpoint given an epoch
        """
        return create_checkpoint_path(
            path=fixed_paths.repository_root_directory() /
            self.checkpoint_folder,
            epoch=epoch)

    def get_effective_random_seed(self) -> int:
        """
        Returns the random seed set as part of this configuration. If the configuration corresponds
        to a cross validation split, then the cross validation fold index will be added to the
        set random seed in order to return the effective random seed.
        :return:
        """
        seed = self.random_seed
        if self.perform_cross_validation:
            # offset the random seed based on the cross validation split index so each
            # fold has a different initial random state.
            seed += self.cross_validation_split_index
        return seed

    @property  # type: ignore
    def use_gpu(self) -> bool:  # type: ignore
        """
        Returns True if a CUDA capable GPU is present and should be used, False otherwise.
        """
        if self._use_gpu is None:
            # Use a local import here because we don't want the whole file to depend on pytorch.
            from InnerEye.ML.utils.ml_util import is_gpu_available
            self._use_gpu = is_gpu_available()
        return self._use_gpu

    @use_gpu.setter
    def use_gpu(self, value: bool) -> None:
        """
        Sets the flag that controls the use of the GPU. Raises a ValueError if the value is True, but no GPU is
        present.
        """
        if value:
            # Use a local import here because we don't want the whole file to depend on pytorch.
            from InnerEye.ML.utils.ml_util import is_gpu_available
            if not is_gpu_available():
                raise ValueError(
                    "Can't set use_gpu to True if there is not CUDA capable GPU present."
                )
        self._use_gpu = value

    @property
    def use_data_parallel(self) -> bool:
        """
        Data parallel is used if GPUs are usable and the number of CUDA devices is greater than 1.
        :return:
        """
        _devices = self.get_cuda_devices()
        return _devices is not None and len(_devices) > 1

    def write_args_file(self, root: Optional[Path] = None) -> None:
        """
        Writes the current config to disk. The file is written either to the given folder, or if omitted,
        to the default outputs folder.
        """
        dst = (root or self.outputs_folder) / ARGS_TXT
        dst.write_text(data=str(self))

    def should_wait_for_other_cross_val_child_runs(self) -> bool:
        """
        Returns True if the current run is an online run and is the 0th cross validation split.
        In this case, this will be the run that will wait for all other child runs to finish in order
        to aggregate their results.
        :return:
        """
        return (
            not self.is_offline_run) and self.cross_validation_split_index == 0

    @property
    def is_offline_run(self) -> bool:
        """
        Returns True if the run is executing outside AzureML, or False if inside AzureML.
        """
        return is_offline_run_context(RUN_CONTEXT)

    @property
    def compute_mean_teacher_model(self) -> bool:
        """
        Returns True if the mean teacher model should be computed.
        """
        return self.mean_teacher_alpha is not None

    def __str__(self) -> str:
        """Returns a string describing the present object, as a list of key == value pairs."""
        arguments_str = "\nArguments:\n"
        property_dict = vars(self)
        keys = sorted(property_dict)
        for key in keys:
            arguments_str += "\t{:18}: {}\n".format(key, property_dict[key])
        return arguments_str
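The epoch bookkeeping above (get_train_epochs, should_save_epoch, get_total_number_of_save_epochs) can be reproduced standalone; a sketch under assumed settings, handy for predicting which checkpoints a run will write:

from typing import List


def train_epochs(start_epoch: int, num_epochs: int) -> List[int]:
    return list(range(start_epoch + 1, num_epochs + 1))


def is_save_epoch(epoch: int, save_start_epoch: int, save_step_epochs: int, num_epochs: int) -> bool:
    return (epoch >= save_start_epoch and epoch % save_step_epochs == 0) or epoch == num_epochs


# Assuming start_epoch=0, num_epochs=4, save_start_epoch=2 and save_step_epochs=2 (values chosen for
# illustration), checkpoints are written at epochs 2 and 4, so two checkpoints in total.
saved = [e for e in train_epochs(0, 4) if is_save_epoch(e, 2, 2, 4)]
assert saved == [2, 4]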
from InnerEye.Azure.azure_util import is_running_on_azure_agent
from InnerEye.Common.common_util import is_windows
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.config import SegmentationModelBase, equally_weighted_classes
from InnerEye.ML.dataset.sample import PatientMetadata, Sample
from InnerEye.ML.plotting import resize_and_save, scan_with_transparent_overlay
from InnerEye.ML.utils import io_util
from InnerEye.ML.utils.image_util import get_unit_image_header
from InnerEye.ML.utils.io_util import load_nifti_image
from InnerEye.ML.utils.ml_util import set_random_seed
from InnerEye.ML.visualizers.patch_sampling import visualize_random_crops
from Tests.ML.util import assert_binary_files_match, assert_file_exists


@pytest.mark.skipif(is_windows(), reason="Plotting output is not consistent across platforms.")
@pytest.mark.parametrize("labels_to_boundary", [True, False])
def test_visualize_patch_sampling(test_output_dirs: OutputFolderForTests,
                                  labels_to_boundary: bool) -> None:
    """
    Tests if patch sampling and the production of diagnostic images work as expected.
    :param test_output_dirs:
    :param labels_to_boundary: If true, the ground truth labels are placed close to the image boundary, so that
    crops have to be adjusted inwards. If false, ground truth labels are all far from the image boundaries.
    """
    set_random_seed(0)
    shape = (10, 30, 30)
    foreground_classes = ["fg"]
    class_weights = equally_weighted_classes(foreground_classes)
    config = SegmentationModelBase(should_validate=False,
                                   crop_size=(2, 10, 10),
    assert "Combination of input arguments is not recognized" in str(ex)


from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.configs.classification.DummyMulticlassClassification import DummyMulticlassClassification
from InnerEye.ML.configs.classification.GlaucomaPublic import GlaucomaPublic
from InnerEye.ML.dataset.scalar_dataset import ScalarDataset
from InnerEye.ML.metrics_dict import MetricsDict
from InnerEye.ML.reports.classification_report import ReportedScalarMetrics, get_correct_and_misclassified_examples, \
    get_image_filepath_from_subject_id, get_image_labels_from_subject_id, get_image_outputs_from_subject_id, \
    get_k_best_and_worst_performing, get_labels_and_predictions, get_metric, get_metrics_table_for_prediction_target, \
    plot_image_from_filepath
from InnerEye.ML.reports.notebook_report import generate_classification_crossval_notebook, \
    generate_classification_notebook
from InnerEye.ML.scalar_config import ScalarModelBase


@pytest.mark.skipif(is_windows(), reason="Random timeout errors on windows.")
def test_generate_classification_report(
        test_output_dirs: OutputFolderForTests) -> None:
    reports_folder = Path(__file__).parent
    test_metrics_file = reports_folder / "test_metrics_classification.csv"
    val_metrics_file = reports_folder / "val_metrics_classification.csv"

    config = ScalarModelBase(label_value_column="label",
                             image_file_column="filePath",
                             subject_column="subject")
    config.local_dataset = test_output_dirs.root_dir / "dataset"
    config.local_dataset.mkdir()
    dataset_csv = config.local_dataset / "dataset.csv"
    image_file_name = "image.npy"
    dataset_csv.write_text("subject,filePath,label\n"
                           f"0,0_{image_file_name},0\n"
Example #14
0
def _test_model_train(output_dirs: OutputFolderForTests,
                      image_channels: Any,
                      ground_truth_ids: Any,
                      no_mask_channel: bool = False) -> None:
    def _check_patch_centers(diagnostics_per_epoch: List[np.ndarray],
                             should_equal: bool) -> None:
        patch_centers_epoch1 = diagnostics_per_epoch[0]
        assert len(
            diagnostics_per_epoch
        ) > 1, "Not enough data to check patch centers, need at least 2"
        for diagnostic in diagnostics_per_epoch[1:]:
            assert np.array_equal(patch_centers_epoch1,
                                  diagnostic) == should_equal

    def _check_voxel_count(results_per_epoch: List[Dict[str, float]],
                           expected_voxel_count_per_epoch: List[float],
                           prefix: str) -> None:
        assert len(results_per_epoch) == len(expected_voxel_count_per_epoch)
        for epoch, (results, voxel_count) in enumerate(
                zip(results_per_epoch, expected_voxel_count_per_epoch)):
            # In the test data, both structures "region" and "region_1" are read from the same nifti file, hence
            # their voxel counts must be identical.
            for structure in ["region", "region_1"]:
                assert results[f"{MetricType.VOXEL_COUNT.value}/{structure}"] == pytest.approx(voxel_count, abs=1e-2), \
                    f"{prefix} voxel count mismatch for '{structure}' epoch {epoch}"

    def _mean(a: List[float]) -> float:
        return sum(a) / len(a)

    def _mean_list(lists: List[List[float]]) -> List[float]:
        return list(map(_mean, lists))

    logging_to_stdout(log_level=logging.DEBUG)
    train_config = DummyModel()
    train_config.local_dataset = base_path
    train_config.set_output_to(output_dirs.root_dir)
    train_config.image_channels = image_channels
    train_config.ground_truth_ids = ground_truth_ids
    train_config.mask_id = None if no_mask_channel else train_config.mask_id
    train_config.random_seed = 42
    train_config.class_weights = [0.5, 0.25, 0.25]
    train_config.store_dataset_sample = no_mask_channel
    train_config.check_exclusive = False

    if machine_has_gpu:
        expected_train_losses = [0.4554231, 0.4550124]
        expected_val_losses = [0.4553894, 0.4553061]
    else:
        expected_train_losses = [0.4554231, 0.4550112]
        expected_val_losses = [0.4553893, 0.4553061]
    loss_absolute_tolerance = 1e-6
    expected_learning_rates = [train_config.l_rate, 5.3589e-4]

    model_training_result, _ = model_train_unittest(train_config,
                                                    output_folder=output_dirs)
    assert isinstance(model_training_result, StoringLogger)
    # Check that all metrics from the BatchTimeCallback are present
    # # TODO: re-enable once the BatchTimeCallback is fixed
    # for epoch, epoch_results in model_training_result.results_per_epoch.items():
    #     for prefix in [TRAIN_PREFIX, VALIDATION_PREFIX]:
    #         for metric_type in [BatchTimeCallback.EPOCH_TIME,
    #                             BatchTimeCallback.BATCH_TIME + " avg",
    #                             BatchTimeCallback.BATCH_TIME + " max",
    #                             BatchTimeCallback.EXCESS_LOADING_TIME]:
    #             expected = BatchTimeCallback.METRICS_PREFIX + prefix + metric_type
    #             assert expected in epoch_results, f"Expected {expected} in results for epoch {epoch}"
    #             # Excess loading time can be zero because that only measures batches over the threshold
    #             if metric_type != BatchTimeCallback.EXCESS_LOADING_TIME:
    #                 value = epoch_results[expected]
    #                 assert isinstance(value, float)
    #                 assert value > 0.0, f"Time for {expected} should be > 0"

    actual_train_losses = model_training_result.get_train_metric(
        MetricType.LOSS.value)
    actual_val_losses = model_training_result.get_val_metric(
        MetricType.LOSS.value)
    print("actual_train_losses = {}".format(actual_train_losses))
    print("actual_val_losses = {}".format(actual_val_losses))

    def assert_all_close(metric: str, expected: List[float],
                         **kwargs: Any) -> None:
        actual = model_training_result.get_train_metric(metric)
        assert np.allclose(
            actual, expected, **kwargs
        ), f"Mismatch for {metric}: Got {actual}, expected {expected}"

    # check to make sure training batches are NOT all the same across epochs
    _check_patch_centers(model_training_result.train_diagnostics,
                         should_equal=False)
    # check to make sure validation batches are all the same across epochs
    _check_patch_centers(model_training_result.val_diagnostics,
                         should_equal=True)
    assert_all_close(MetricType.SUBJECT_COUNT.value, [3.0, 3.0])
    assert_all_close(MetricType.LEARNING_RATE.value,
                     expected_learning_rates,
                     rtol=1e-6)

    if is_windows():
        # Randomization comes out slightly different on Windows. Skip the rest of the detailed checks.
        return

    # Simple regression test: Voxel counts should be the same in both epochs on the validation set,
    # and be the same across 'region' and 'region_1' because they derive from the same Nifti files.
    # The following values are read off directly from the results of compute_dice_across_patches in the training loop.
    # This checks that averages are computed correctly, and that metric computers are reset after each epoch.
    train_voxels = [[82765.0, 83212.0, 82740.0], [82831.0, 82647.0, 83255.0]]
    val_voxels = [[82765.0, 83212.0], [82765.0, 83212.0]]
    _check_voxel_count(model_training_result.train_results_per_epoch(),
                       _mean_list(train_voxels), "Train")
    _check_voxel_count(model_training_result.val_results_per_epoch(),
                       _mean_list(val_voxels), "Val")

    assert np.allclose(actual_train_losses,
                       expected_train_losses,
                       atol=loss_absolute_tolerance), "Train losses"
    assert np.allclose(actual_val_losses,
                       expected_val_losses,
                       atol=loss_absolute_tolerance), "Val losses"
    # Check that the metric we track for Hyperdrive runs is actually written.
    assert TrackedMetrics.Val_Loss.value.startswith(VALIDATION_PREFIX)
    tracked_metric = TrackedMetrics.Val_Loss.value[len(VALIDATION_PREFIX):]
    for val_result in model_training_result.val_results_per_epoch():
        assert tracked_metric in val_result

    # The following values are read off directly from the results of compute_dice_across_patches in the
    # training loop. Results are slightly different for GPU, hence use a larger tolerance there.
    dice_tolerance = 1e-3 if machine_has_gpu else 4.5e-4
    train_dice_region = [[0.0, 0.0, 0.0], [0.0376, 0.0343, 0.1017]]
    train_dice_region1 = [[0.4845, 0.4814, 0.4829], [0.4822, 0.4747, 0.4426]]
    # There appears to be some amount of non-determinism here: when using a tolerance of 1e-4, we get occasional
    # test failures on Linux in the cloud (not on Windows, not on AzureML). It is unclear where it comes from. Even when
    # failing here, the losses match up to the expected tolerance.
    assert_all_close("Dice/region",
                     _mean_list(train_dice_region),
                     atol=dice_tolerance)
    assert_all_close("Dice/region_1",
                     _mean_list(train_dice_region1),
                     atol=dice_tolerance)
    expected_average_dice = [
        _mean(train_dice_region[i] + train_dice_region1[i])  # type: ignore
        for i in range(len(train_dice_region))
    ]
    assert_all_close("Dice/AverageAcrossStructures",
                     expected_average_dice,
                     atol=dice_tolerance)

    # check output files/directories
    assert train_config.outputs_folder.is_dir()
    assert train_config.logs_folder.is_dir()

    # TensorBoard event files go into a Lightning subfolder (PyTorch Lightning default)
    assert (train_config.logs_folder / "Lightning").is_dir()
    assert len(list((train_config.logs_folder / "Lightning").glob("events*"))) == 1

    assert train_config.num_epochs == 2
    # Checkpoint folder
    assert train_config.checkpoint_folder.is_dir()
    actual_checkpoints = list(train_config.checkpoint_folder.rglob("*.ckpt"))
    assert len(
        actual_checkpoints) == 1, f"Actual checkpoints: {actual_checkpoints}"
    assert (train_config.checkpoint_folder /
            LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
    assert (train_config.outputs_folder / DATASET_CSV_FILE_NAME).is_file()
    assert (train_config.outputs_folder /
            STORED_CSV_FILE_NAMES[ModelExecutionMode.TRAIN]).is_file()
    assert (train_config.outputs_folder /
            STORED_CSV_FILE_NAMES[ModelExecutionMode.VAL]).is_file()

    # Patch visualization: There should be 3 slices for each of the 2 subjects
    sampling_folder = train_config.outputs_folder / PATCH_SAMPLING_FOLDER
    assert sampling_folder.is_dir()
    assert train_config.show_patch_sampling > 0
    assert len(list(sampling_folder.rglob(
        "*.png"))) == 3 * train_config.show_patch_sampling

    # Test for saving of example images
    if train_config.store_dataset_sample:
        assert train_config.example_images_folder.is_dir()
    example_files = list(train_config.example_images_folder.rglob("*.*"))
    # Expected count: images x epochs x patients (3 * 2 * 2) when dataset samples are stored, otherwise 0.
    expected_example_file_count = 3 * 2 * 2 if train_config.store_dataset_sample else 0
    assert len(example_files) == expected_example_file_count
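
To make the Dice averaging in the test above concrete, here is a small self-contained sketch with toy numbers (not the values from the test) showing how _mean_list and the expected "Dice/AverageAcrossStructures" values are derived:

from typing import List


def _mean(a: List[float]) -> float:
    return sum(a) / len(a)


def _mean_list(lists: List[List[float]]) -> List[float]:
    return list(map(_mean, lists))


# Toy per-epoch Dice values for two structures (illustrative only).
train_dice_region = [[0.0, 0.0, 0.0], [0.03, 0.03, 0.10]]
train_dice_region1 = [[0.48, 0.48, 0.48], [0.48, 0.47, 0.44]]

# Per-epoch mean Dice for each structure.
print(_mean_list(train_dice_region))   # [0.0, 0.0533...]
print(_mean_list(train_dice_region1))  # [0.48, 0.4633...]

# The expected average across structures is the mean over the concatenated per-structure lists of each epoch.
expected_average_dice = [_mean(train_dice_region[i] + train_dice_region1[i])
                         for i in range(len(train_dice_region))]
print(expected_average_dice)  # [0.24, 0.2583...]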
Example #15
0
class DeepLearningConfig(WorkflowParams, DatasetParams, OutputParams,
                         OptimizerParams, TrainerParams, GenericConfig):
    """
    A class that holds all settings that are shared across segmentation models and regression/classification models.
    """
    _model_category: ModelCategory = param.ClassSelector(
        class_=ModelCategory,
        doc="The high-level model category described by this config.")

    num_dataload_workers: int = param.Integer(
        2,
        bounds=(0, None),
        doc="The number of data loading workers (processes). When set to 0,"
        "data loading is running in the same process (no process startup "
        "cost, hence good for use in unit testing. However, it "
        "does not give the same result as running with 1 worker process)")
    shuffle: bool = param.Boolean(
        True,
        doc="If true, the dataset will be shuffled randomly during training.")
    train_batch_size: int = param.Integer(
        4,
        bounds=(0, None),
        doc="The number of crops that make up one minibatch during training.")
    use_model_parallel: bool = param.Boolean(
        False,
        doc="If true, neural network model is partitioned across all "
        "available GPUs to fit in a large model. It shall not be used "
        "together with data parallel.")
    pin_memory: bool = param.Boolean(
        True, doc="Value of pin_memory argument to DataLoader")
    restrict_subjects: Optional[str] = \
        param.String(doc="Use at most this number of subjects for train, val, or test set (must be > 0 or None). "
                         "If None, do not modify the train, val, or test sets. If a string of the form 'i,j,k' where "
                         "i, j and k are integers, modify just the corresponding sets (i for train, j for val, k for "
                         "test). If any of i, j or j are missing or are negative, do not modify the corresponding "
                         "set. Thus a value of 20,,5 means limit training set to 20, keep validation set as is, and "
                         "limit test set to 5. If any of i,j,k is '+', discarded members of the other sets are added "
                         "to that set.",
                     allow_None=True)
    _dataset_data_frame: Optional[DataFrame] = \
        param.DataFrame(default=None,
                        doc="The dataframe that contains the dataset for the model. This is usually read from disk "
                            "from dataset.csv")
    avoid_process_spawn_in_data_loaders: bool = \
        param.Boolean(is_windows(), doc="If True, use a data loader logic that avoid spawning new processes at the "
                                        "start of each epoch. This speeds up training on both Windows and Linux, but"
                                        "on Linux, inference is currently disabled as the data loaders hang. "
                                        "If False, use the default data loader logic that starts new processes for "
                                        "each epoch.")
    max_batch_grad_cam: int = param.Integer(
        default=0,
        doc="Max number of validation batches for which "
        "to save gradCam images. By default "
        "visualizations are saved for all images "
        "in the validation set")
    label_smoothing_eps: float = param.Number(
        0.0,
        bounds=(0.0, 1.0),
        doc="Target smoothing value for label smoothing")
    log_to_parent_run: bool = param.Boolean(
        default=False,
        doc="If true, hyperdrive child runs will log their metrics"
        "to their parent run.")
    use_imbalanced_sampler_for_training: bool = param.Boolean(
        default=False,
        doc="If True, use an imbalanced sampler during training.")
    drop_last_batch_in_training: bool = param.Boolean(
        default=False,
        doc="If True, drop the last incomplete batch during"
        "training. If all batches are complete, no batch gets "
        "dropped. If False, keep all batches.")
    log_summaries_to_files: bool = param.Boolean(
        default=True,
        doc=
        "If True, model summaries are logged to files in logs/model_summaries; "
        "if False, to stdout or driver log")
    mean_teacher_alpha: float = param.Number(
        bounds=(0, 1),
        allow_None=True,
        default=None,
        doc="If this value is set, the mean teacher model will be computed. "
        "Currently only supported for scalar models. In this case, we only "
        "report metrics and cross-validation results for "
        "the mean teacher model. Likewise the model used for inference "
        "is the mean teacher model. The student model is only used for "
        "training. Alpha is the momentum term for weight updates of the mean "
        "teacher model. After each training step the mean teacher model "
        "weights are updated using mean_teacher_"
        "weight = alpha * (mean_teacher_weight) "
        " + (1-alpha) * (current_student_weights). ")
    #: Name of the csv file providing information on the dataset to be used.
    dataset_csv: str = param.String(
        DATASET_CSV_FILE_NAME,
        doc=
        "Name of the CSV file providing information on the dataset to be used. "
        "For segmentation models, this file must contain at least the fields: `subject`, `channel`, `filePath`."
    )

    def __init__(self, **params: Any) -> None:
        self._model_name = type(self).__name__
        # This should be annotated as torch.utils.data.Dataset, but we don't want to import torch here.
        self._datasets_for_training: Optional[Dict[ModelExecutionMode,
                                                   Any]] = None
        self._datasets_for_inference: Optional[Dict[ModelExecutionMode,
                                                    Any]] = None
        self.recovery_start_epoch = 0
        super().__init__(throw_if_unknown_param=True, **params)
        logging.info("Creating the default output folder structure.")
        self.create_filesystem(fixed_paths.repository_root_directory())
        # Disable the PL progress bar because all InnerEye models have their own console output
        self.pl_progress_bar_refresh_rate = 0
        self.extra_downloaded_run_id: Optional[Any] = None

    def validate(self) -> None:
        """
        Validates the parameters stored in the present object.
        """
        WorkflowParams.validate(self)
        OptimizerParams.validate(self)

        if self.azure_dataset_id is None and self.local_dataset is None:
            raise ValueError(
                "Either of local_dataset or azure_dataset_id must be set.")

    @property
    def model_category(self) -> ModelCategory:
        """
        Gets the high-level model category that this configuration object represents (segmentation or scalar output).
        """
        return self._model_category

    @property
    def is_segmentation_model(self) -> bool:
        """
        Returns True if the present model configuration belongs to the high-level category ModelCategory.Segmentation.
        """
        return self.model_category == ModelCategory.Segmentation

    @property
    def is_scalar_model(self) -> bool:
        """
        Returns True if the present model configuration belongs to the high-level category ModelCategory.Scalar
        i.e. for Classification or Regression models.
        """
        return self.model_category.is_scalar

    @property
    def compute_grad_cam(self) -> bool:
        return self.max_batch_grad_cam > 0

    @property
    def dataset_data_frame(self) -> Optional[DataFrame]:
        """
        Gets the pandas data frame that the model uses.
        :return:
        """
        return self._dataset_data_frame

    @dataset_data_frame.setter
    def dataset_data_frame(self, data_frame: Optional[DataFrame]) -> None:
        """
        Sets the pandas data frame that the model uses.
        :param data_frame: The data frame to set.
        """
        self._dataset_data_frame = data_frame

    def get_train_epochs(self) -> List[int]:
        """
        Returns the epochs for which training will be performed.
        :return:
        """
        return list(range(self.recovery_start_epoch + 1, self.num_epochs + 1))

    def get_total_number_of_training_epochs(self) -> int:
        """
        Returns the number of epochs for which a model will be trained.
        :return:
        """
        return len(self.get_train_epochs())

    def get_total_number_of_validation_epochs(self) -> int:
        """
        Returns the number of epochs for which a model will be validated.
        :return:
        """
        return self.get_total_number_of_training_epochs()

    @property
    def compute_mean_teacher_model(self) -> bool:
        """
        Returns True if the mean teacher model should be computed.
        """
        return self.mean_teacher_alpha is not None

    def __str__(self) -> str:
        """Returns a string describing the present object, as a list of key: value strings."""
        arguments_str = "\nArguments:\n"
        # Avoid callable params; the bindings that are printed out can be humongous.
        # Avoid dataframes
        skip_params = {
            name
            for name, value in self.param.params().items()
            if isinstance(value, (param.Callable, param.DataFrame))
        }
        for key, value in self.param.get_param_values():
            if key not in skip_params:
                arguments_str += f"\t{key:40}: {value}\n"
        return arguments_str

    def load_checkpoint_and_modify(self,
                                   path_to_checkpoint: Path) -> Dict[str, Any]:
        """
        By default, uses torch.load to read and return the state dict from the checkpoint file, without modifying it.

        Overriding this function:
        When weights_url or local_weights_path is set, the downloaded file may not be in the exact
        format expected by the model's load_state_dict() - for example, pretrained ImageNet weights for networks
        may have mismatched layer names in different implementations.
        In such cases, you can override this function to extract the state dict from the checkpoint.

        NOTE: The model checkpoint will be loaded using the torch function load_state_dict() with argument strict=False,
        so extra care needs to be taken to check that the state dict is valid.
        Check the logs for warnings related to missing and unexpected keys.
        See https://pytorch.org/tutorials/beginner/saving_loading_models.html#warmstarting-model-using-parameters-from-a-different-model
        for an explanation of why strict=False is useful when loading parameters from other models.
        :param path_to_checkpoint: Path to the checkpoint file.
        :return: Dictionary with model and optimizer state dicts. The dict should have at least the following keys:
        1. Key ModelAndInfo.MODEL_STATE_DICT_KEY and value set to the model state dict.
        2. Key ModelAndInfo.EPOCH_KEY and value set to the checkpoint epoch.
        Other (optional) entries corresponding to keys ModelAndInfo.OPTIMIZER_STATE_DICT_KEY and
        ModelAndInfo.MEAN_TEACHER_STATE_DICT_KEY are also supported.
        """
        return load_checkpoint(path_to_checkpoint=path_to_checkpoint,
                               use_gpu=self.use_gpu)
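
As a hedged illustration of the kind of override that load_checkpoint_and_modify describes, the sketch below loads a checkpoint and strips a hypothetical "module." key prefix (as left behind by torch.nn.DataParallel); the checkpoint layout and the helper name are assumptions, not InnerEye code:

from pathlib import Path
from typing import Any, Dict

import torch


def strip_dataparallel_prefix(path_to_checkpoint: Path) -> Dict[str, Any]:
    """Illustrative helper: load a checkpoint and remove an assumed 'module.' prefix from its keys."""
    # Load onto CPU so the sketch also works on machines without a GPU.
    checkpoint = torch.load(path_to_checkpoint, map_location="cpu")
    # Some exporters nest the weights under "state_dict"; otherwise fall back to the raw dictionary.
    state_dict = checkpoint.get("state_dict", checkpoint)
    prefix = "module."
    return {key[len(prefix):] if key.startswith(prefix) else key: value
            for key, value in state_dict.items()}

Inside an actual override, the stripped dictionary would then be returned under the keys described in the docstring above (at least ModelAndInfo.MODEL_STATE_DICT_KEY and ModelAndInfo.EPOCH_KEY).
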
def _test_model_train(output_dirs: OutputFolderForTests,
                      image_channels: Any,
                      ground_truth_ids: Any,
                      no_mask_channel: bool = False) -> None:
    def _check_patch_centers(diagnostics_per_epoch: List[np.ndarray],
                             should_equal: bool) -> None:
        patch_centers_epoch1 = diagnostics_per_epoch[0]
        assert len(
            diagnostics_per_epoch
        ) > 1, "Not enough data to check patch centers, need at least 2"
        for diagnostic in diagnostics_per_epoch[1:]:
            assert np.array_equal(patch_centers_epoch1,
                                  diagnostic) == should_equal

    def _check_voxel_count(results_per_epoch: List[Dict[str, float]],
                           expected_voxel_count_per_epoch: List[float],
                           prefix: str) -> None:
        assert len(results_per_epoch) == len(expected_voxel_count_per_epoch)
        for epoch, (results, voxel_count) in enumerate(
                zip(results_per_epoch, expected_voxel_count_per_epoch)):
            # In the test data, both structures "region" and "region_1" are read from the same nifti file, hence
            # their voxel counts must be identical.
            for structure in ["region", "region_1"]:
                assert results[f"{MetricType.VOXEL_COUNT.value}/{structure}"] == pytest.approx(voxel_count, abs=1e-2), \
                    f"{prefix} voxel count mismatch for '{structure}' epoch {epoch}"

    def _mean(a: List[float]) -> float:
        return sum(a) / len(a)

    def _mean_list(lists: List[List[float]]) -> List[float]:
        return list(map(_mean, lists))

    logging_to_stdout(log_level=logging.DEBUG)
    train_config = DummyModel()
    train_config.local_dataset = base_path
    train_config.set_output_to(output_dirs.root_dir)
    train_config.image_channels = image_channels
    train_config.ground_truth_ids = ground_truth_ids
    train_config.mask_id = None if no_mask_channel else train_config.mask_id
    train_config.random_seed = 42
    train_config.class_weights = [0.5, 0.25, 0.25]
    train_config.store_dataset_sample = True
    train_config.recovery_checkpoint_save_interval = 1

    if machine_has_gpu:
        expected_train_losses = [0.4553468, 0.454904]
        expected_val_losses = [0.4553881, 0.4553041]
    else:
        expected_train_losses = [0.4553469, 0.4548947]
        expected_val_losses = [0.4553880, 0.4553041]
    loss_absolute_tolerance = 1e-6
    expected_learning_rates = [train_config.l_rate, 5.3589e-4]

    checkpoint_handler = get_default_checkpoint_handler(
        model_config=train_config, project_root=Path(output_dirs.root_dir))
    model_training_result = model_training.model_train(
        train_config, checkpoint_handler=checkpoint_handler)
    assert isinstance(model_training_result, ModelTrainingResults)

    def assert_all_close(metric: str, expected: List[float],
                         **kwargs: Any) -> None:
        actual = model_training_result.get_training_metric(metric)
        assert np.allclose(
            actual, expected, **kwargs
        ), f"Mismatch for {metric}: Got {actual}, expected {expected}"

    # check to make sure training batches are NOT all the same across epochs
    _check_patch_centers(model_training_result.train_diagnostics,
                         should_equal=False)
    # check to make sure validation batches are all the same across epochs
    _check_patch_centers(model_training_result.val_diagnostics,
                         should_equal=True)
    assert_all_close(MetricType.SUBJECT_COUNT.value, [3.0, 3.0])
    assert_all_close(MetricType.LEARNING_RATE.value,
                     expected_learning_rates,
                     rtol=1e-6)

    if is_windows():
        # Randomization comes out slightly different on Windows. Skip the rest of the detailed checks.
        return

    # Simple regression test: Voxel counts should be the same in both epochs on the validation set,
    # and be the same across 'region' and 'region_1' because they derive from the same Nifti files.
    # The following values are read off directly from the results of compute_dice_across_patches in the training loop.
    # This checks that averages are computed correctly, and that metric computers are reset after each epoch.
    train_voxels = [[83092.0, 83212.0, 82946.0], [83000.0, 82881.0, 83309.0]]
    val_voxels = [[82765.0, 83212.0], [82765.0, 83212.0]]
    _check_voxel_count(model_training_result.train_results_per_epoch,
                       _mean_list(train_voxels), "Train")
    _check_voxel_count(model_training_result.val_results_per_epoch,
                       _mean_list(val_voxels), "Val")

    actual_train_losses = model_training_result.get_training_metric(
        MetricType.LOSS.value)
    actual_val_losses = model_training_result.get_validation_metric(
        MetricType.LOSS.value)
    print("actual_train_losses = {}".format(actual_train_losses))
    print("actual_val_losses = {}".format(actual_val_losses))
    assert np.allclose(actual_train_losses,
                       expected_train_losses,
                       atol=loss_absolute_tolerance), "Train losses"
    assert np.allclose(actual_val_losses,
                       expected_val_losses,
                       atol=loss_absolute_tolerance), "Val losses"
    # Check that the metric we track for Hyperdrive runs is actually written.
    assert TrackedMetrics.Val_Loss.value.startswith(VALIDATION_PREFIX)
    tracked_metric = TrackedMetrics.Val_Loss.value[len(VALIDATION_PREFIX):]
    for val_result in model_training_result.val_results_per_epoch:
        assert tracked_metric in val_result

    # The following values are read off directly from the results of compute_dice_across_patches in the
    # training loop. Results are slightly different for CPU, hence use a larger tolerance there.
    dice_tolerance = 1e-4 if machine_has_gpu else 4.5e-4
    train_dice_region = [[0.0, 0.0, 4.0282e-04], [0.0309, 0.0334, 0.0961]]
    train_dice_region1 = [[0.4806, 0.4800, 0.4832], [0.4812, 0.4842, 0.4663]]
    # There appears to be some amount of non-determinism here: when using a tolerance of 1e-4, we get occasional
    # test failures on Linux in the cloud (not on Windows, not on AzureML). It is unclear where it comes from. Even when
    # failing here, the losses match up to the expected tolerance.
    assert_all_close("Dice/region",
                     _mean_list(train_dice_region),
                     atol=dice_tolerance)
    assert_all_close("Dice/region_1",
                     _mean_list(train_dice_region1),
                     atol=dice_tolerance)
    expected_average_dice = [
        _mean(train_dice_region[i] + train_dice_region1[i])  # type: ignore
        for i in range(len(train_dice_region))
    ]
    assert_all_close("Dice/AverageAcrossStructures",
                     expected_average_dice,
                     atol=dice_tolerance)

    # check output files/directories
    assert train_config.outputs_folder.is_dir()
    assert train_config.logs_folder.is_dir()

    # TensorBoard event files go into a Lightning subfolder (PyTorch Lightning default)
    assert (train_config.logs_folder / "Lightning").is_dir()
    assert len(list((train_config.logs_folder / "Lightning").glob("events*"))) == 1

    assert train_config.num_epochs == 2
    # Checkpoint folder
    assert train_config.checkpoint_folder.is_dir()
    actual_checkpoints = list(train_config.checkpoint_folder.rglob("*.ckpt"))
    assert len(
        actual_checkpoints) == 2, f"Actual checkpoints: {actual_checkpoints}"
    assert (train_config.checkpoint_folder /
            RECOVERY_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
    assert (train_config.checkpoint_folder /
            BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
    assert (train_config.outputs_folder / DATASET_CSV_FILE_NAME).is_file()
    assert (train_config.outputs_folder /
            STORED_CSV_FILE_NAMES[ModelExecutionMode.TRAIN]).is_file()
    assert (train_config.outputs_folder /
            STORED_CSV_FILE_NAMES[ModelExecutionMode.VAL]).is_file()

    # Patch visualization: There should be 3 slices for each of the 2 subjects
    sampling_folder = train_config.outputs_folder / PATCH_SAMPLING_FOLDER
    assert sampling_folder.is_dir()
    assert train_config.show_patch_sampling > 0
    assert len(list(sampling_folder.rglob(
        "*.png"))) == 3 * train_config.show_patch_sampling

    # Time per epoch: Test that we have all these times logged.
    model_training_result.get_training_metric(
        MetricType.SECONDS_PER_EPOCH.value)
    model_training_result.get_validation_metric(
        MetricType.SECONDS_PER_EPOCH.value)
    model_training_result.get_validation_metric(
        MetricType.SECONDS_PER_BATCH.value)
    model_training_result.get_training_metric(
        MetricType.SECONDS_PER_BATCH.value)
Example #17
0
    which are provided as input into the computational graph
    :return:
    """
    loader = cropping_dataset.as_data_loader(
        shuffle=True, batch_size=2, num_dataload_workers=num_dataload_workers)
    for i, item in enumerate(loader):
        item = CroppedSample.from_dict(item)
        assert item.image.numpy().dtype == ImageDataType.IMAGE.value
        assert item.labels.numpy().dtype == ImageDataType.SEGMENTATION.value
        assert item.mask.numpy().dtype == ImageDataType.MASK.value
        assert item.mask_center_crop.numpy().dtype == ImageDataType.MASK.value
        assert item.labels_center_crop.numpy(
        ).dtype == ImageDataType.SEGMENTATION.value


@pytest.mark.skipif(common_util.is_windows(),
                    reason="Has issues on windows build")
def test_cropping_dataset_padding(cropping_dataset: CroppingDataset,
                                  num_dataload_workers: int) -> None:
    """
    Tests that the cropping dataset correctly pads image, labels, and mask tensors when the requested
    crop size is larger than the image size.
    :return:
    """
    cropping_dataset.args.crop_size = (300, 300, 300)
    cropping_dataset.args.padding_mode = PaddingMode.Zero
    loader = cropping_dataset.as_data_loader(shuffle=True,
                                             batch_size=2,
                                             num_dataload_workers=1)

    for i, item in enumerate(loader):
from InnerEye.Common.common_util import is_windows
from InnerEye.Common.output_directories import TestOutputDirectories
from InnerEye.ML.config import SegmentationModelBase, equally_weighted_classes
from InnerEye.ML.dataset.sample import PatientMetadata, Sample
from InnerEye.ML.plotting import resize_and_save, scan_with_transparent_overlay
from InnerEye.ML.utils import io_util
from InnerEye.ML.utils.image_util import get_unit_image_header
from InnerEye.ML.utils.io_util import load_nifti_image
from InnerEye.ML.utils.ml_util import set_random_seed
from InnerEye.ML.visualizers.patch_sampling import visualize_random_crops
from Tests.ML.util import assert_binary_files_match, assert_file_exists, is_running_on_azure
from Tests.fixed_paths_for_tests import full_ml_test_data_path


@pytest.mark.skipif(
    is_windows(), reason="Plotting output is not consistent across platforms.")
@pytest.mark.parametrize("labels_to_boundary", [True, False])
def test_visualize_patch_sampling(test_output_dirs: TestOutputDirectories,
                                  labels_to_boundary: bool) -> None:
    """
    Tests if patch sampling and producing diagnostic images works as expected.
    :param test_output_dirs:
    :param labels_to_boundary: If true, the ground truth labels are placed close to the image boundary, so that
    crops have to be adjusted inwards. If false, ground truth labels are all far from the image boundaries.
    """
    set_random_seed(0)
    shape = (10, 30, 30)
    foreground_classes = ["fg"]
    class_weights = equally_weighted_classes(foreground_classes)
    config = SegmentationModelBase(should_validate=False,
                                   crop_size=(2, 10, 10),
def check_log_file(path: Path, expected_csv: str,
                   ignore_columns: List[str]) -> None:
    df_expected = pd.read_csv(StringIO(expected_csv))
    df_epoch_metrics_actual = pd.read_csv(path)
    for ignore_column in ignore_columns:
        assert ignore_column in df_epoch_metrics_actual
        # We cannot compare timing columns because they differ from machine to machine
        del df_epoch_metrics_actual[ignore_column]
        if ignore_column in df_expected:
            del df_expected[ignore_column]
    pd.testing.assert_frame_equal(df_expected,
                                  df_epoch_metrics_actual,
                                  check_less_precise=True)
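
A hedged usage sketch for check_log_file, with hypothetical file and column names chosen only to illustrate the call pattern:

from pathlib import Path

# Expected metrics, with a timing column that is machine-dependent and therefore ignored.
expected_csv = ("epoch,loss,seconds_per_epoch\n"
                "1,0.70,12.3\n"
                "2,0.65,11.9\n")
check_log_file(path=Path("outputs") / "epoch_metrics.csv",
               expected_csv=expected_csv,
               ignore_columns=["seconds_per_epoch"])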


@pytest.mark.skipif(common_util.is_windows(), reason="Too slow on windows")
@pytest.mark.parametrize("model_name",
                         ["DummyClassification", "DummyRegression"])
@pytest.mark.parametrize("number_of_offline_cross_validation_splits", [2])
@pytest.mark.parametrize("number_of_cross_validation_splits_per_fold", [2])
def test_run_ml_with_classification_model(
        test_output_dirs: TestOutputDirectories,
        number_of_offline_cross_validation_splits: int,
        number_of_cross_validation_splits_per_fold: int,
        model_name: str) -> None:
    """
    Test training and testing of classification models, when it is started together via run_ml.
    """
    logging_to_stdout()
    azure_config = get_default_azure_config()
    azure_config.train = True
class WorkflowParams(param.Parameterized):
    """
    This class contains all parameters that affect how the whole training and testing workflow is executed.
    """
    random_seed: int = param.Integer(42, doc="The seed to use for all random number generators.")
    number_of_cross_validation_splits: int = param.Integer(0, bounds=(0, None),
                                                           doc="Number of cross validation splits for k-fold cross "
                                                               "validation")
    cross_validation_split_index: int = param.Integer(DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, bounds=(-1, None),
                                                      doc="The index of the cross validation fold this model is "
                                                          "associated with when performing k-fold cross validation")
    inference_on_train_set: Optional[bool] = \
        param.Boolean(None,
                      doc="If set, enable/disable full image inference on training set after training.")
    inference_on_val_set: Optional[bool] = \
        param.Boolean(None,
                      doc="If set, enable/disable full image inference on validation set after training.")
    inference_on_test_set: Optional[bool] = \
        param.Boolean(None,
                      doc="If set, enable/disable full image inference on test set after training.")
    ensemble_inference_on_train_set: Optional[bool] = \
        param.Boolean(None,
                      doc="If set, enable/disable full image inference on the training set after ensemble training.")
    ensemble_inference_on_val_set: Optional[bool] = \
        param.Boolean(None,
                      doc="If set, enable/disable full image inference on validation set after ensemble training.")
    ensemble_inference_on_test_set: Optional[bool] = \
        param.Boolean(None,
                      doc="If set, enable/disable full image inference on test set after ensemble training.")
    weights_url: List[str] = param.List(default=[], class_=str,
                                        doc="If provided, a set of urls from which checkpoints will be downloaded"
                                            "and used for inference.")
    local_weights_path: List[Path] = param.List(default=[], class_=Path,
                                                doc="A list of checkpoints paths to use for inference, "
                                                    "when the job is running outside Azure.")
    model_id: str = param.String(default="",
                                 doc="A model id string in the form 'model name:version' "
                                     "to use a registered model for inference.")
    generate_report: bool = param.Boolean(default=True,
                                          doc="If True (default), write a modelling report in HTML format. If False,"
                                              "do not write that report.")
    pretraining_run_recovery_id: str = param.String(default=None,
                                                    allow_None=True,
                                                    doc="Extra run recovery id to download checkpoints from,"
                                                        "for custom modules (e.g. for loading pretrained weights)."
                                                        "The downloaded RunRecovery object will be available in"
                                                        "pretraining_run_checkpoints.")

    # The default multiprocessing start_method in both PyTorch and the Python standard library is "fork" for Linux and
    # "spawn" (the only available method) for Windows. There is some evidence that using "forkserver" on Linux
    # can reduce the chance of stuck jobs.
    multiprocessing_start_method: MultiprocessingStartMethod = \
        param.ClassSelector(class_=MultiprocessingStartMethod,
                            default=(MultiprocessingStartMethod.spawn if is_windows()
                                     else MultiprocessingStartMethod.fork),
                            doc="Method to be used to start child processes in pytorch. Should be one of forkserver, "
                                "fork or spawn. If not specified, fork is used on Linux and spawn on Windows. "
                                "Set to forkserver as a possible remedy for stuck jobs.")
    monitoring_interval_seconds: int = param.Integer(0, doc="Seconds delay between logging GPU/CPU resource "
                                                            "statistics. If 0 or less, do not log any resource "
                                                            "statistics.")
    regression_test_folder: Optional[Path] = \
        param.ClassSelector(class_=Path, default=None, allow_None=True,
                            doc="A path to a folder that contains a set of files. At the end of training and "
                                "model evaluation, all files given in that folder must be present in the job's output "
                                "folder, and their contents must match exactly. When running in AzureML, you need to "
                                "ensure that this folder is part of the snapshot that gets uploaded. The path should "
                                "be relative to the repository root directory.")
    regression_test_csv_tolerance: float = \
        param.Number(default=0.0, allow_None=False,
                     doc="When comparing CSV files during regression tests, use this value as the maximum allowed "
                         "relative difference of actual and expected results. Default: 0.0 (must match exactly)")

    def validate(self) -> None:
        if sum([bool(param) for param in [self.weights_url, self.local_weights_path, self.model_id]]) > 1:
            raise ValueError("Cannot specify more than one of local_weights_path, weights_url or model_id.")

        if self.model_id:
            if len(self.model_id.split(":")) != 2:
                raise ValueError(
                    f"model_id should be in the form 'model_name:version', got {self.model_id}")

        if self.number_of_cross_validation_splits == 1:
            raise ValueError("At least two splits required to perform cross validation, but got "
                             f"{self.number_of_cross_validation_splits}. To train without cross validation, set "
                             "number_of_cross_validation_splits=0.")
        if 0 < self.number_of_cross_validation_splits <= self.cross_validation_split_index:
            raise ValueError(f"Cross validation split index is out of bounds: {self.cross_validation_split_index}, "
                             f"which is invalid for CV with {self.number_of_cross_validation_splits} splits.")
        elif self.number_of_cross_validation_splits == 0 and self.cross_validation_split_index != -1:
            raise ValueError(f"Cross validation split index must be -1 for a non cross validation run, "
                             f"found number_of_cross_validation_splits = {self.number_of_cross_validation_splits} "
                             f"and cross_validation_split_index={self.cross_validation_split_index}")

    def is_inference_required(self,
                              model_proc: ModelProcessing,
                              data_split: ModelExecutionMode) -> bool:
        """
        Returns True if inference is required for this model_proc (single or ensemble) and data_split (Train/Val/Test).
        :param model_proc: Whether we are testing an ensemble or single model.
        :param data_split: Indicates which of the 3 sets (training, test, or validation) is being processed.
        :return: True if inference required.
        """
        settings = {
            ModelProcessing.DEFAULT: {
                ModelExecutionMode.TRAIN: self.inference_on_train_set,
                ModelExecutionMode.TEST: self.inference_on_test_set,
                ModelExecutionMode.VAL: self.inference_on_val_set,
            },
            ModelProcessing.ENSEMBLE_CREATION: {
                ModelExecutionMode.TRAIN: self.ensemble_inference_on_train_set,
                ModelExecutionMode.TEST: self.ensemble_inference_on_test_set,
                ModelExecutionMode.VAL: self.ensemble_inference_on_val_set,
            }
        }
        inference_option = settings[model_proc][data_split]
        if inference_option is not None:
            return inference_option

        # Defaults for when to run inference in the absence of any command line switches.
        # This depends on ModelProcessing, perform_cross_validation, and ModelExecutionMode.
        # If the current combination of these three parameters is not in this data structure,
        # then default to False.
        defaults: Dict[ModelProcessing, Dict[bool, Dict[ModelExecutionMode, bool]]] = {
            ModelProcessing.DEFAULT: {
                False: {
                    ModelExecutionMode.TRAIN: False,
                    ModelExecutionMode.VAL: False,
                    ModelExecutionMode.TEST: True,
                }
            },
            ModelProcessing.ENSEMBLE_CREATION: {
                True: {
                    ModelExecutionMode.TRAIN: False,
                    ModelExecutionMode.VAL: False,
                    ModelExecutionMode.TEST: True,
                }
            }
        }
        try:
            return defaults[model_proc][self.perform_cross_validation][data_split]
        except KeyError:
            return False
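    # Illustrative reading of the defaults above (assuming no command line overrides): a plain single-model
    # run without cross validation only gets inference on the Test set; during ensemble creation in a
    # cross validation run, the ensemble is likewise only evaluated on the Test set. Any other combination
    # falls through to False.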

    @property
    def is_offline_run(self) -> bool:
        """
        Returns True if the run is executing outside AzureML, or False if inside AzureML.
        """
        return is_offline_run_context(RUN_CONTEXT)

    @property
    def perform_cross_validation(self) -> bool:
        """
        True if cross validation will be performed as part of the training procedure.
        :return:
        """
        return self.number_of_cross_validation_splits > 1

    def get_effective_random_seed(self) -> int:
        """
        Returns the random seed set as part of this configuration. If the configuration corresponds
        to a cross validation split, then the cross validation fold index is added to the
        configured random seed in order to return the effective random seed.
        :return: The effective random seed.
        """
        seed = self.random_seed
        if self.perform_cross_validation:
            # offset the random seed based on the cross validation split index so each
            # fold has a different initial random state.
            seed += self.cross_validation_split_index
        return seed
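    # Worked example (illustrative numbers): with random_seed=42, number_of_cross_validation_splits=5 and
    # cross_validation_split_index=2, perform_cross_validation is True, so the effective seed is 42 + 2 = 44.
    # For a run without cross validation the seed stays at 42.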