def _test_model_train(output_dirs: TestOutputDirectories,
                      image_channels: Any,
                      ground_truth_ids: Any,
                      no_mask_channel: bool = False) -> None:
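    """
    Trains the DummyModel for two epochs and checks the resulting losses, learning rates,
    the training statistics file, checkpoints, and example images against expected values.
    """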
    def _check_patch_centers(epoch_results: List[MetricsDict],
                             should_equal: bool) -> None:
        diagnostics_per_epoch = [
            m.diagnostics[MetricType.PATCH_CENTER.value] for m in epoch_results
        ]
        patch_centers_epoch1 = diagnostics_per_epoch[0]
        for diagnostic in diagnostics_per_epoch[1:]:
            assert np.array_equal(patch_centers_epoch1,
                                  diagnostic) == should_equal

    train_config = DummyModel()
    train_config.local_dataset = base_path
    train_config.set_output_to(output_dirs.root_dir)
    train_config.image_channels = image_channels
    train_config.ground_truth_ids = ground_truth_ids
    train_config.mask_id = None if no_mask_channel else train_config.mask_id
    train_config.random_seed = 42
    train_config.class_weights = [0.5, 0.25, 0.25]
    train_config.store_dataset_sample = True

    expected_train_losses = [0.455538, 0.455213]
    expected_val_losses = [0.455190, 0.455139]

    expected_stats = "Epoch\tLearningRate\tTrainLoss\tTrainDice\tValLoss\tValDice\n" \
                     "1\t1.00e-03\t0.456\t0.242\t0.455\t0.000\n" \
                     "2\t5.36e-04\t0.455\t0.247\t0.455\t0.000"

    expected_learning_rates = [[train_config.l_rate], [5.3589e-4]]

    loss_absolute_tolerance = 1e-3
    model_training_result = model_training.model_train(train_config)
    assert isinstance(model_training_result, ModelTrainingResults)

    # check to make sure training batches are NOT all the same across epochs
    _check_patch_centers(model_training_result.train_results_per_epoch,
                         should_equal=False)
    # check to make sure validation batches are all the same across epochs
    _check_patch_centers(model_training_result.val_results_per_epoch,
                         should_equal=True)
    assert isinstance(model_training_result.train_results_per_epoch[0],
                      MetricsDict)
    actual_train_losses = [
        m.get_single_metric(MetricType.LOSS)
        for m in model_training_result.train_results_per_epoch
    ]
    actual_val_losses = [
        m.get_single_metric(MetricType.LOSS)
        for m in model_training_result.val_results_per_epoch
    ]
    print("actual_train_losses = {}".format(actual_train_losses))
    print("actual_val_losses = {}".format(actual_val_losses))
    assert np.allclose(actual_train_losses,
                       expected_train_losses,
                       atol=loss_absolute_tolerance)
    assert np.allclose(actual_val_losses,
                       expected_val_losses,
                       atol=loss_absolute_tolerance)
    assert np.allclose(model_training_result.learning_rates_per_epoch,
                       expected_learning_rates,
                       rtol=1e-6)

    # check output files/directories
    assert train_config.outputs_folder.is_dir()
    assert train_config.logs_folder.is_dir()

    # The train and val folders should contain TensorBoard event files
    assert (train_config.logs_folder / "train").is_dir()
    assert (train_config.logs_folder / "val").is_dir()
    assert len(list((train_config.logs_folder / "train").glob("*"))) == 1
    assert len(list((train_config.logs_folder / "val").glob("*"))) == 1

    # Checkpoint folder
    # With these settings, we should see a checkpoint only at epoch 2:
    # That's the last epoch, and there should always be a checkpoint at the last epoch.
    assert train_config.save_start_epoch == 1
    assert train_config.save_step_epochs == 100
    assert train_config.num_epochs == 2
    assert os.path.isdir(train_config.checkpoint_folder)
    assert os.path.isfile(
        os.path.join(train_config.checkpoint_folder,
                     "2" + CHECKPOINT_FILE_SUFFIX))
    assert (train_config.outputs_folder / DATASET_CSV_FILE_NAME).is_file()
    assert (train_config.outputs_folder /
            STORED_CSV_FILE_NAMES[ModelExecutionMode.TRAIN]).is_file()
    assert (train_config.outputs_folder /
            STORED_CSV_FILE_NAMES[ModelExecutionMode.VAL]).is_file()
    assert_file_contents(train_config.outputs_folder / TRAIN_STATS_FILE,
                         expected_stats)

    # Test for saving of example images
    assert os.path.isdir(train_config.example_images_folder)
    example_files = os.listdir(train_config.example_images_folder)
    assert len(example_files) == 3 * 2
def _test_model_train(output_dirs: OutputFolderForTests,
                      image_channels: Any,
                      ground_truth_ids: Any,
                      no_mask_channel: bool = False) -> None:
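    """
    Trains the DummyModel for two epochs with model_training.model_train and checks losses,
    learning rates, per-epoch metrics (subject counts, voxel counts, Dice), checkpoints,
    and other output files against expected values.
    """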
    def _check_patch_centers(diagnostics_per_epoch: List[np.ndarray],
                             should_equal: bool) -> None:
        assert len(diagnostics_per_epoch) > 1, \
            "Not enough data to check patch centers, need at least 2"
        patch_centers_epoch1 = diagnostics_per_epoch[0]
        for diagnostic in diagnostics_per_epoch[1:]:
            assert np.array_equal(patch_centers_epoch1,
                                  diagnostic) == should_equal

    def _check_voxel_count(results_per_epoch: List[Dict[str, float]],
                           expected_voxel_count_per_epoch: List[float],
                           prefix: str) -> None:
        assert len(results_per_epoch) == len(expected_voxel_count_per_epoch)
        for epoch, (results, voxel_count) in enumerate(
                zip(results_per_epoch, expected_voxel_count_per_epoch)):
            # In the test data, both structures "region" and "region_1" are read from the same nifti file, hence
            # their voxel counts must be identical.
            for structure in ["region", "region_1"]:
                assert results[f"{MetricType.VOXEL_COUNT.value}/{structure}"] == pytest.approx(voxel_count, abs=1e-2), \
                    f"{prefix} voxel count mismatch for '{structure}' epoch {epoch}"

    def _mean(a: List[float]) -> float:
        return sum(a) / len(a)

    def _mean_list(lists: List[List[float]]) -> List[float]:
        return list(map(_mean, lists))

    logging_to_stdout(log_level=logging.DEBUG)
    train_config = DummyModel()
    train_config.local_dataset = base_path
    train_config.set_output_to(output_dirs.root_dir)
    train_config.image_channels = image_channels
    train_config.ground_truth_ids = ground_truth_ids
    train_config.mask_id = None if no_mask_channel else train_config.mask_id
    train_config.random_seed = 42
    train_config.class_weights = [0.5, 0.25, 0.25]
    train_config.store_dataset_sample = True
    train_config.recovery_checkpoint_save_interval = 1

    if machine_has_gpu:
        expected_train_losses = [0.4553468, 0.454904]
        expected_val_losses = [0.4553881, 0.4553041]
    else:
        expected_train_losses = [0.4553469, 0.4548947]
        expected_val_losses = [0.4553880, 0.4553041]
    loss_absolute_tolerance = 1e-6
    expected_learning_rates = [train_config.l_rate, 5.3589e-4]

    checkpoint_handler = get_default_checkpoint_handler(
        model_config=train_config, project_root=Path(output_dirs.root_dir))
    model_training_result = model_training.model_train(
        train_config, checkpoint_handler=checkpoint_handler)
    assert isinstance(model_training_result, ModelTrainingResults)

    def assert_all_close(metric: str, expected: List[float],
                         **kwargs: Any) -> None:
        actual = model_training_result.get_training_metric(metric)
        assert np.allclose(
            actual, expected, **kwargs
        ), f"Mismatch for {metric}: Got {actual}, expected {expected}"

    # check to make sure training batches are NOT all the same across epochs
    _check_patch_centers(model_training_result.train_diagnostics,
                         should_equal=False)
    # check to make sure validation batches are all the same across epochs
    _check_patch_centers(model_training_result.val_diagnostics,
                         should_equal=True)
    assert_all_close(MetricType.SUBJECT_COUNT.value, [3.0, 3.0])
    assert_all_close(MetricType.LEARNING_RATE.value,
                     expected_learning_rates,
                     rtol=1e-6)

    if is_windows():
        # Randomization comes out slightly different on Windows. Skip the rest of the detailed checks.
        return

    # Simple regression test: Voxel counts should be the same in both epochs on the validation set,
    # and be the same across 'region' and 'region_1' because they derive from the same Nifti files.
    # The following values are read off directly from the results of compute_dice_across_patches in the training
    # loop. This checks that averages are computed correctly, and that metric computers are reset after each epoch.
    train_voxels = [[83092.0, 83212.0, 82946.0], [83000.0, 82881.0, 83309.0]]
    val_voxels = [[82765.0, 83212.0], [82765.0, 83212.0]]
    _check_voxel_count(model_training_result.train_results_per_epoch,
                       _mean_list(train_voxels), "Train")
    _check_voxel_count(model_training_result.val_results_per_epoch,
                       _mean_list(val_voxels), "Val")

    actual_train_losses = model_training_result.get_training_metric(
        MetricType.LOSS.value)
    actual_val_losses = model_training_result.get_validation_metric(
        MetricType.LOSS.value)
    print("actual_train_losses = {}".format(actual_train_losses))
    print("actual_val_losses = {}".format(actual_val_losses))
    assert np.allclose(actual_train_losses,
                       expected_train_losses,
                       atol=loss_absolute_tolerance), "Train losses"
    assert np.allclose(actual_val_losses,
                       expected_val_losses,
                       atol=loss_absolute_tolerance), "Val losses"
    # Check that the metric we track for Hyperdrive runs is actually written.
    assert TrackedMetrics.Val_Loss.value.startswith(VALIDATION_PREFIX)
    tracked_metric = TrackedMetrics.Val_Loss.value[len(VALIDATION_PREFIX):]
    for val_result in model_training_result.val_results_per_epoch:
        assert tracked_metric in val_result

    # The following values are read off directly from the results of compute_dice_across_patches in the
    # training loop. Results are slightly different for CPU, hence use a larger tolerance there.
    dice_tolerance = 1e-4 if machine_has_gpu else 4.5e-4
    train_dice_region = [[0.0, 0.0, 4.0282e-04], [0.0309, 0.0334, 0.0961]]
    train_dice_region1 = [[0.4806, 0.4800, 0.4832], [0.4812, 0.4842, 0.4663]]
    # There appears to be some amount of non-determinism here: with a tolerance of 1e-4, we get occasional
    # test failures on Linux in the cloud (not on Windows, not on AzureML). It is unclear where this comes from.
    # Even when failing here, the losses match up to the expected tolerance.
    assert_all_close("Dice/region",
                     _mean_list(train_dice_region),
                     atol=dice_tolerance)
    assert_all_close("Dice/region_1",
                     _mean_list(train_dice_region1),
                     atol=dice_tolerance)
    expected_average_dice = [
        _mean(train_dice_region[i] + train_dice_region1[i])  # type: ignore
        for i in range(len(train_dice_region))
    ]
    assert_all_close("Dice/AverageAcrossStructures",
                     expected_average_dice,
                     atol=dice_tolerance)

    # check output files/directories
    assert train_config.outputs_folder.is_dir()
    assert train_config.logs_folder.is_dir()

    # TensorBoard event files go into a "Lightning" subfolder (the PyTorch Lightning default)
    assert (train_config.logs_folder / "Lightning").is_dir()
    assert len(list((train_config.logs_folder / "Lightning").glob("events*"))) == 1

    assert train_config.num_epochs == 2
    # Checkpoint folder
    assert train_config.checkpoint_folder.is_dir()
    actual_checkpoints = list(train_config.checkpoint_folder.rglob("*.ckpt"))
    assert len(
        actual_checkpoints) == 2, f"Actual checkpoints: {actual_checkpoints}"
    assert (train_config.checkpoint_folder /
            RECOVERY_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
    assert (train_config.checkpoint_folder /
            BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
    assert (train_config.outputs_folder / DATASET_CSV_FILE_NAME).is_file()
    assert (train_config.outputs_folder /
            STORED_CSV_FILE_NAMES[ModelExecutionMode.TRAIN]).is_file()
    assert (train_config.outputs_folder /
            STORED_CSV_FILE_NAMES[ModelExecutionMode.VAL]).is_file()

    # Patch visualization: There should be 3 slices for each of the 2 subjects
    sampling_folder = train_config.outputs_folder / PATCH_SAMPLING_FOLDER
    assert sampling_folder.is_dir()
    assert train_config.show_patch_sampling > 0
    assert len(list(sampling_folder.rglob(
        "*.png"))) == 3 * train_config.show_patch_sampling

    # Time per epoch: Test that we have all these times logged.
    model_training_result.get_training_metric(
        MetricType.SECONDS_PER_EPOCH.value)
    model_training_result.get_validation_metric(
        MetricType.SECONDS_PER_EPOCH.value)
    model_training_result.get_validation_metric(
        MetricType.SECONDS_PER_BATCH.value)
    model_training_result.get_training_metric(
        MetricType.SECONDS_PER_BATCH.value)
def _test_model_train(output_dirs: OutputFolderForTests,
                      image_channels: Any,
                      ground_truth_ids: Any,
                      no_mask_channel: bool = False) -> None:
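    """
    Trains the DummyModel for two epochs via model_train_unittest and checks losses, learning
    rates, per-epoch metrics (subject counts, voxel counts, Dice), checkpoints, patch sampling
    plots, and example images against expected values.
    """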
    def _check_patch_centers(diagnostics_per_epoch: List[np.ndarray],
                             should_equal: bool) -> None:
        assert len(diagnostics_per_epoch) > 1, \
            "Not enough data to check patch centers, need at least 2"
        patch_centers_epoch1 = diagnostics_per_epoch[0]
        for diagnostic in diagnostics_per_epoch[1:]:
            assert np.array_equal(patch_centers_epoch1,
                                  diagnostic) == should_equal

    def _check_voxel_count(results_per_epoch: List[Dict[str, float]],
                           expected_voxel_count_per_epoch: List[float],
                           prefix: str) -> None:
        assert len(results_per_epoch) == len(expected_voxel_count_per_epoch)
        for epoch, (results, voxel_count) in enumerate(
                zip(results_per_epoch, expected_voxel_count_per_epoch)):
            # In the test data, both structures "region" and "region_1" are read from the same nifti file, hence
            # their voxel counts must be identical.
            for structure in ["region", "region_1"]:
                assert results[f"{MetricType.VOXEL_COUNT.value}/{structure}"] == pytest.approx(voxel_count, abs=1e-2), \
                    f"{prefix} voxel count mismatch for '{structure}' epoch {epoch}"

    def _mean(a: List[float]) -> float:
        return sum(a) / len(a)

    def _mean_list(lists: List[List[float]]) -> List[float]:
        return list(map(_mean, lists))

    logging_to_stdout(log_level=logging.DEBUG)
    train_config = DummyModel()
    train_config.local_dataset = base_path
    train_config.set_output_to(output_dirs.root_dir)
    train_config.image_channels = image_channels
    train_config.ground_truth_ids = ground_truth_ids
    train_config.mask_id = None if no_mask_channel else train_config.mask_id
    train_config.random_seed = 42
    train_config.class_weights = [0.5, 0.25, 0.25]
    train_config.store_dataset_sample = no_mask_channel
    train_config.check_exclusive = False

    if machine_has_gpu:
        expected_train_losses = [0.4554231, 0.4550124]
        expected_val_losses = [0.4553894, 0.4553061]
    else:
        expected_train_losses = [0.4554231, 0.4550112]
        expected_val_losses = [0.4553893, 0.4553061]
    loss_absolute_tolerance = 1e-6
    expected_learning_rates = [train_config.l_rate, 5.3589e-4]

    model_training_result, _ = model_train_unittest(train_config,
                                                    output_folder=output_dirs)
    assert isinstance(model_training_result, StoringLogger)
    # Check that all metrics from the BatchTimeCallback are present
    # # TODO: re-enable once the BatchTimeCallback is fixed
    # for epoch, epoch_results in model_training_result.results_per_epoch.items():
    #     for prefix in [TRAIN_PREFIX, VALIDATION_PREFIX]:
    #         for metric_type in [BatchTimeCallback.EPOCH_TIME,
    #                             BatchTimeCallback.BATCH_TIME + " avg",
    #                             BatchTimeCallback.BATCH_TIME + " max",
    #                             BatchTimeCallback.EXCESS_LOADING_TIME]:
    #             expected = BatchTimeCallback.METRICS_PREFIX + prefix + metric_type
    #             assert expected in epoch_results, f"Expected {expected} in results for epoch {epoch}"
    #             # Excess loading time can be zero because that only measure batches over the threshold
    #             if metric_type != BatchTimeCallback.EXCESS_LOADING_TIME:
    #                 value = epoch_results[expected]
    #                 assert isinstance(value, float)
    #                 assert value > 0.0, f"Time for {expected} should be > 0"

    actual_train_losses = model_training_result.get_train_metric(
        MetricType.LOSS.value)
    actual_val_losses = model_training_result.get_val_metric(
        MetricType.LOSS.value)
    print("actual_train_losses = {}".format(actual_train_losses))
    print("actual_val_losses = {}".format(actual_val_losses))

    def assert_all_close(metric: str, expected: List[float],
                         **kwargs: Any) -> None:
        actual = model_training_result.get_train_metric(metric)
        assert np.allclose(
            actual, expected, **kwargs
        ), f"Mismatch for {metric}: Got {actual}, expected {expected}"

    # check to make sure training batches are NOT all the same across epochs
    _check_patch_centers(model_training_result.train_diagnostics,
                         should_equal=False)
    # check to make sure validation batches are all the same across epochs
    _check_patch_centers(model_training_result.val_diagnostics,
                         should_equal=True)
    assert_all_close(MetricType.SUBJECT_COUNT.value, [3.0, 3.0])
    assert_all_close(MetricType.LEARNING_RATE.value,
                     expected_learning_rates,
                     rtol=1e-6)

    if is_windows():
        # Randomization comes out slightly different on Windows. Skip the rest of the detailed checks.
        return

    # Simple regression test: Voxel counts should be the same in both epochs on the validation set,
    # and be the same across 'region' and 'region_1' because they derive from the same Nifti files.
    # The following values are read off directly from the results of compute_dice_across_patches in the training
    # loop. This checks that averages are computed correctly, and that metric computers are reset after each epoch.
    train_voxels = [[82765.0, 83212.0, 82740.0], [82831.0, 82647.0, 83255.0]]
    val_voxels = [[82765.0, 83212.0], [82765.0, 83212.0]]
    _check_voxel_count(model_training_result.train_results_per_epoch(),
                       _mean_list(train_voxels), "Train")
    _check_voxel_count(model_training_result.val_results_per_epoch(),
                       _mean_list(val_voxels), "Val")

    assert np.allclose(actual_train_losses,
                       expected_train_losses,
                       atol=loss_absolute_tolerance), "Train losses"
    assert np.allclose(actual_val_losses,
                       expected_val_losses,
                       atol=loss_absolute_tolerance), "Val losses"
    # Check that the metric we track for Hyperdrive runs is actually written.
    assert TrackedMetrics.Val_Loss.value.startswith(VALIDATION_PREFIX)
    tracked_metric = TrackedMetrics.Val_Loss.value[len(VALIDATION_PREFIX):]
    for val_result in model_training_result.val_results_per_epoch():
        assert tracked_metric in val_result

    # The following values are read off directly from the results of compute_dice_across_patches in the
    # training loop. Results are slightly different for GPU, hence use a larger tolerance there.
    dice_tolerance = 1e-3 if machine_has_gpu else 4.5e-4
    train_dice_region = [[0.0, 0.0, 0.0], [0.0376, 0.0343, 0.1017]]
    train_dice_region1 = [[0.4845, 0.4814, 0.4829], [0.4822, 0.4747, 0.4426]]
    # There appears to be some amount of non-determinism here: with a tolerance of 1e-4, we get occasional
    # test failures on Linux in the cloud (not on Windows, not on AzureML). It is unclear where this comes from.
    # Even when failing here, the losses match up to the expected tolerance.
    assert_all_close("Dice/region",
                     _mean_list(train_dice_region),
                     atol=dice_tolerance)
    assert_all_close("Dice/region_1",
                     _mean_list(train_dice_region1),
                     atol=dice_tolerance)
    expected_average_dice = [
        _mean(train_dice_region[i] + train_dice_region1[i])  # type: ignore
        for i in range(len(train_dice_region))
    ]
    assert_all_close("Dice/AverageAcrossStructures",
                     expected_average_dice,
                     atol=dice_tolerance)

    # check output files/directories
    assert train_config.outputs_folder.is_dir()
    assert train_config.logs_folder.is_dir()

    # TensorBoard event files go into a "Lightning" subfolder (the PyTorch Lightning default)
    assert (train_config.logs_folder / "Lightning").is_dir()
    assert len(list((train_config.logs_folder / "Lightning").glob("events*"))) == 1

    assert train_config.num_epochs == 2
    # Checkpoint folder
    assert train_config.checkpoint_folder.is_dir()
    actual_checkpoints = list(train_config.checkpoint_folder.rglob("*.ckpt"))
    assert len(
        actual_checkpoints) == 1, f"Actual checkpoints: {actual_checkpoints}"
    assert (train_config.checkpoint_folder /
            LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
    assert (train_config.outputs_folder / DATASET_CSV_FILE_NAME).is_file()
    assert (train_config.outputs_folder /
            STORED_CSV_FILE_NAMES[ModelExecutionMode.TRAIN]).is_file()
    assert (train_config.outputs_folder /
            STORED_CSV_FILE_NAMES[ModelExecutionMode.VAL]).is_file()

    # Patch visualization: There should be 3 slices for each of the 2 subjects
    sampling_folder = train_config.outputs_folder / PATCH_SAMPLING_FOLDER
    assert sampling_folder.is_dir()
    assert train_config.show_patch_sampling > 0
    assert len(list(sampling_folder.rglob(
        "*.png"))) == 3 * train_config.show_patch_sampling

    # Test for saving of example images
    if train_config.store_dataset_sample:
        assert train_config.example_images_folder.is_dir()
    example_files = list(train_config.example_images_folder.rglob("*.*"))
    # 3 images x 2 epochs x 2 patients when dataset samples are stored, otherwise no example files
    expected_example_files = 3 * 2 * 2 if train_config.store_dataset_sample else 0
    assert len(example_files) == expected_example_files
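

# A minimal sketch (not part of the original tests) of how a helper like _test_model_train
# is typically driven from a parametrized pytest test. The fixture name "test_output_dirs"
# and the image channel names are hypothetical placeholders; the ground-truth ids mirror the
# "region" / "region_1" structures checked above.
import pytest


@pytest.mark.parametrize("no_mask_channel", [True, False])
def test_model_train(test_output_dirs: OutputFolderForTests,
                     no_mask_channel: bool) -> None:
    _test_model_train(output_dirs=test_output_dirs,
                      image_channels=["channel1", "channel2"],
                      ground_truth_ids=["region", "region_1"],
                      no_mask_channel=no_mask_channel)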