def _test_model_train(output_dirs: TestOutputDirectories,
                      image_channels: Any,
                      ground_truth_ids: Any,
                      no_mask_channel: bool = False) -> None:
    def _check_patch_centers(epoch_results: List[MetricsDict], should_equal: bool) -> None:
        diagnostics_per_epoch = [m.diagnostics[MetricType.PATCH_CENTER.value] for m in epoch_results]
        patch_centers_epoch1 = diagnostics_per_epoch[0]
        for diagnostic in diagnostics_per_epoch[1:]:
            assert np.array_equal(patch_centers_epoch1, diagnostic) == should_equal

    train_config = DummyModel()
    train_config.local_dataset = base_path
    train_config.set_output_to(output_dirs.root_dir)
    train_config.image_channels = image_channels
    train_config.ground_truth_ids = ground_truth_ids
    train_config.mask_id = None if no_mask_channel else train_config.mask_id
    train_config.random_seed = 42
    train_config.class_weights = [0.5, 0.25, 0.25]
    train_config.store_dataset_sample = True

    expected_train_losses = [0.455538, 0.455213]
    expected_val_losses = [0.455190, 0.455139]
    expected_stats = "Epoch\tLearningRate\tTrainLoss\tTrainDice\tValLoss\tValDice\n" \
                     "1\t1.00e-03\t0.456\t0.242\t0.455\t0.000\n" \
                     "2\t5.36e-04\t0.455\t0.247\t0.455\t0.000"
    expected_learning_rates = [[train_config.l_rate], [5.3589e-4]]
    loss_absolute_tolerance = 1e-3

    model_training_result = model_training.model_train(train_config)
    assert isinstance(model_training_result, ModelTrainingResults)

    # check to make sure training batches are NOT all the same across epochs
    _check_patch_centers(model_training_result.train_results_per_epoch, should_equal=False)
    # check to make sure validation batches are all the same across epochs
    _check_patch_centers(model_training_result.val_results_per_epoch, should_equal=True)

    assert isinstance(model_training_result.train_results_per_epoch[0], MetricsDict)
    actual_train_losses = [m.get_single_metric(MetricType.LOSS)
                           for m in model_training_result.train_results_per_epoch]
    actual_val_losses = [m.get_single_metric(MetricType.LOSS)
                         for m in model_training_result.val_results_per_epoch]
    print("actual_train_losses = {}".format(actual_train_losses))
    print("actual_val_losses = {}".format(actual_val_losses))
    assert np.allclose(actual_train_losses, expected_train_losses, atol=loss_absolute_tolerance)
    assert np.allclose(actual_val_losses, expected_val_losses, atol=loss_absolute_tolerance)
    assert np.allclose(model_training_result.learning_rates_per_epoch, expected_learning_rates, rtol=1e-6)

    # check output files/directories
    assert train_config.outputs_folder.is_dir()
    assert train_config.logs_folder.is_dir()
    # The train and val folders should each contain one Tensorflow event file
    assert (train_config.logs_folder / "train").is_dir()
    assert (train_config.logs_folder / "val").is_dir()
    assert len(list((train_config.logs_folder / "train").glob("*"))) == 1
    assert len(list((train_config.logs_folder / "val").glob("*"))) == 1

    # Checkpoint folder
    # With these settings, we should see a checkpoint only at epoch 2:
    # that is the last epoch, and there is always a checkpoint at the last epoch.
    assert train_config.save_start_epoch == 1
    assert train_config.save_step_epochs == 100
    assert train_config.num_epochs == 2
    assert os.path.isdir(train_config.checkpoint_folder)
    assert os.path.isfile(os.path.join(train_config.checkpoint_folder, "2" + CHECKPOINT_FILE_SUFFIX))
    assert (train_config.outputs_folder / DATASET_CSV_FILE_NAME).is_file()
    assert (train_config.outputs_folder / STORED_CSV_FILE_NAMES[ModelExecutionMode.TRAIN]).is_file()
    assert (train_config.outputs_folder / STORED_CSV_FILE_NAMES[ModelExecutionMode.VAL]).is_file()
    assert_file_contents(train_config.outputs_folder / TRAIN_STATS_FILE, expected_stats)

    # Test for saving of example images
    assert os.path.isdir(train_config.example_images_folder)
    example_files = os.listdir(train_config.example_images_folder)
    assert len(example_files) == 3 * 2
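# A minimal sketch of how this helper could be driven from a parametrized pytest test.
# The test name, the channel/structure values, and the use of an `output_dirs` fixture
# below are illustrative assumptions; only the helper's signature comes from this file.
@pytest.mark.parametrize("image_channels", [["channel1", "channel2"]])
@pytest.mark.parametrize("ground_truth_ids", [["region", "region_1"]])
def test_model_train(output_dirs: TestOutputDirectories,
                     image_channels: Any,
                     ground_truth_ids: Any) -> None:
    _test_model_train(output_dirs, image_channels, ground_truth_ids)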
def _test_model_train(output_dirs: OutputFolderForTests,
                      image_channels: Any,
                      ground_truth_ids: Any,
                      no_mask_channel: bool = False) -> None:
    def _check_patch_centers(diagnostics_per_epoch: List[np.ndarray], should_equal: bool) -> None:
        patch_centers_epoch1 = diagnostics_per_epoch[0]
        assert len(diagnostics_per_epoch) > 1, "Not enough data to check patch centers, need at least 2"
        for diagnostic in diagnostics_per_epoch[1:]:
            assert np.array_equal(patch_centers_epoch1, diagnostic) == should_equal

    def _check_voxel_count(results_per_epoch: List[Dict[str, float]],
                           expected_voxel_count_per_epoch: List[float],
                           prefix: str) -> None:
        assert len(results_per_epoch) == len(expected_voxel_count_per_epoch)
        for epoch, (results, voxel_count) in enumerate(zip(results_per_epoch, expected_voxel_count_per_epoch)):
            # In the test data, both structures "region" and "region_1" are read from the same nifti file, hence
            # their voxel counts must be identical.
            for structure in ["region", "region_1"]:
                assert results[f"{MetricType.VOXEL_COUNT.value}/{structure}"] == pytest.approx(voxel_count, abs=1e-2), \
                    f"{prefix} voxel count mismatch for '{structure}' epoch {epoch}"

    def _mean(a: List[float]) -> float:
        return sum(a) / len(a)

    def _mean_list(lists: List[List[float]]) -> List[float]:
        return list(map(_mean, lists))

    logging_to_stdout(log_level=logging.DEBUG)
    train_config = DummyModel()
    train_config.local_dataset = base_path
    train_config.set_output_to(output_dirs.root_dir)
    train_config.image_channels = image_channels
    train_config.ground_truth_ids = ground_truth_ids
    train_config.mask_id = None if no_mask_channel else train_config.mask_id
    train_config.random_seed = 42
    train_config.class_weights = [0.5, 0.25, 0.25]
    train_config.store_dataset_sample = True
    train_config.recovery_checkpoint_save_interval = 1

    if machine_has_gpu:
        expected_train_losses = [0.4553468, 0.454904]
        expected_val_losses = [0.4553881, 0.4553041]
    else:
        expected_train_losses = [0.4553469, 0.4548947]
        expected_val_losses = [0.4553880, 0.4553041]
    loss_absolute_tolerance = 1e-6
    expected_learning_rates = [train_config.l_rate, 5.3589e-4]

    checkpoint_handler = get_default_checkpoint_handler(model_config=train_config,
                                                        project_root=Path(output_dirs.root_dir))
    model_training_result = model_training.model_train(train_config, checkpoint_handler=checkpoint_handler)
    assert isinstance(model_training_result, ModelTrainingResults)

    def assert_all_close(metric: str, expected: List[float], **kwargs: Any) -> None:
        actual = model_training_result.get_training_metric(metric)
        assert np.allclose(actual, expected, **kwargs), f"Mismatch for {metric}: Got {actual}, expected {expected}"

    # check to make sure training batches are NOT all the same across epochs
    _check_patch_centers(model_training_result.train_diagnostics, should_equal=False)
    # check to make sure validation batches are all the same across epochs
    _check_patch_centers(model_training_result.val_diagnostics, should_equal=True)

    assert_all_close(MetricType.SUBJECT_COUNT.value, [3.0, 3.0])
    assert_all_close(MetricType.LEARNING_RATE.value, expected_learning_rates, rtol=1e-6)

    if is_windows():
        # Randomization comes out slightly different on Windows. Skip the rest of the detailed checks.
        return

    # Simple regression test: Voxel counts should be the same in both epochs on the validation set,
    # and be the same across 'region' and 'region_1' because they derive from the same Nifti files.
    # The following values are read off directly from the results of compute_dice_across_patches in the
    # training loop. This checks that averages are computed correctly, and that metric computers are reset
    # after each epoch.
    train_voxels = [[83092.0, 83212.0, 82946.0], [83000.0, 82881.0, 83309.0]]
    val_voxels = [[82765.0, 83212.0], [82765.0, 83212.0]]
    _check_voxel_count(model_training_result.train_results_per_epoch, _mean_list(train_voxels), "Train")
    _check_voxel_count(model_training_result.val_results_per_epoch, _mean_list(val_voxels), "Val")

    actual_train_losses = model_training_result.get_training_metric(MetricType.LOSS.value)
    actual_val_losses = model_training_result.get_validation_metric(MetricType.LOSS.value)
    print("actual_train_losses = {}".format(actual_train_losses))
    print("actual_val_losses = {}".format(actual_val_losses))
    assert np.allclose(actual_train_losses, expected_train_losses, atol=loss_absolute_tolerance), "Train losses"
    assert np.allclose(actual_val_losses, expected_val_losses, atol=loss_absolute_tolerance), "Val losses"

    # Check that the metric we track for Hyperdrive runs is actually written.
    assert TrackedMetrics.Val_Loss.value.startswith(VALIDATION_PREFIX)
    tracked_metric = TrackedMetrics.Val_Loss.value[len(VALIDATION_PREFIX):]
    for val_result in model_training_result.val_results_per_epoch:
        assert tracked_metric in val_result

    # The following values are read off directly from the results of compute_dice_across_patches in the
    # training loop. Results are slightly different on CPU, hence use a larger tolerance there.
    dice_tolerance = 1e-4 if machine_has_gpu else 4.5e-4
    train_dice_region = [[0.0, 0.0, 4.0282e-04], [0.0309, 0.0334, 0.0961]]
    train_dice_region1 = [[0.4806, 0.4800, 0.4832], [0.4812, 0.4842, 0.4663]]
    # There appears to be some amount of non-determinism here: When using a tolerance of 1e-4, we get occasional
    # test failures on Linux in the cloud (not on Windows, not on AzureML). Unclear where it comes from. Even when
    # failing here, the losses match up to the expected tolerance.
    assert_all_close("Dice/region", _mean_list(train_dice_region), atol=dice_tolerance)
    assert_all_close("Dice/region_1", _mean_list(train_dice_region1), atol=dice_tolerance)
    expected_average_dice = [_mean(train_dice_region[i] + train_dice_region1[i])  # type: ignore
                             for i in range(len(train_dice_region))]
    assert_all_close("Dice/AverageAcrossStructures", expected_average_dice, atol=dice_tolerance)

    # check output files/directories
    assert train_config.outputs_folder.is_dir()
    assert train_config.logs_folder.is_dir()
    # Tensorboard event files go into a Lightning subfolder (PyTorch Lightning default)
    assert (train_config.logs_folder / "Lightning").is_dir()
    assert len(list((train_config.logs_folder / "Lightning").glob("events*"))) == 1

    assert train_config.num_epochs == 2
    # Checkpoint folder
    assert train_config.checkpoint_folder.is_dir()
    actual_checkpoints = list(train_config.checkpoint_folder.rglob("*.ckpt"))
    assert len(actual_checkpoints) == 2, f"Actual checkpoints: {actual_checkpoints}"
    assert (train_config.checkpoint_folder / RECOVERY_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
    assert (train_config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
    assert (train_config.outputs_folder / DATASET_CSV_FILE_NAME).is_file()
    assert (train_config.outputs_folder / STORED_CSV_FILE_NAMES[ModelExecutionMode.TRAIN]).is_file()
    assert (train_config.outputs_folder / STORED_CSV_FILE_NAMES[ModelExecutionMode.VAL]).is_file()

    # Patch visualization: There should be 3 slices for each of the 2 subjects
    sampling_folder = train_config.outputs_folder / PATCH_SAMPLING_FOLDER
    assert sampling_folder.is_dir()
    assert train_config.show_patch_sampling > 0
    assert len(list(sampling_folder.rglob("*.png"))) == 3 * train_config.show_patch_sampling

    # Time per epoch: Test that we have all these times logged.
    model_training_result.get_training_metric(MetricType.SECONDS_PER_EPOCH.value)
    model_training_result.get_validation_metric(MetricType.SECONDS_PER_EPOCH.value)
    model_training_result.get_validation_metric(MetricType.SECONDS_PER_BATCH.value)
    model_training_result.get_training_metric(MetricType.SECONDS_PER_BATCH.value)
def _test_model_train(output_dirs: OutputFolderForTests,
                      image_channels: Any,
                      ground_truth_ids: Any,
                      no_mask_channel: bool = False) -> None:
    def _check_patch_centers(diagnostics_per_epoch: List[np.ndarray], should_equal: bool) -> None:
        patch_centers_epoch1 = diagnostics_per_epoch[0]
        assert len(diagnostics_per_epoch) > 1, "Not enough data to check patch centers, need at least 2"
        for diagnostic in diagnostics_per_epoch[1:]:
            assert np.array_equal(patch_centers_epoch1, diagnostic) == should_equal

    def _check_voxel_count(results_per_epoch: List[Dict[str, float]],
                           expected_voxel_count_per_epoch: List[float],
                           prefix: str) -> None:
        assert len(results_per_epoch) == len(expected_voxel_count_per_epoch)
        for epoch, (results, voxel_count) in enumerate(zip(results_per_epoch, expected_voxel_count_per_epoch)):
            # In the test data, both structures "region" and "region_1" are read from the same nifti file, hence
            # their voxel counts must be identical.
            for structure in ["region", "region_1"]:
                assert results[f"{MetricType.VOXEL_COUNT.value}/{structure}"] == pytest.approx(voxel_count, abs=1e-2), \
                    f"{prefix} voxel count mismatch for '{structure}' epoch {epoch}"

    def _mean(a: List[float]) -> float:
        return sum(a) / len(a)

    def _mean_list(lists: List[List[float]]) -> List[float]:
        return list(map(_mean, lists))

    logging_to_stdout(log_level=logging.DEBUG)
    train_config = DummyModel()
    train_config.local_dataset = base_path
    train_config.set_output_to(output_dirs.root_dir)
    train_config.image_channels = image_channels
    train_config.ground_truth_ids = ground_truth_ids
    train_config.mask_id = None if no_mask_channel else train_config.mask_id
    train_config.random_seed = 42
    train_config.class_weights = [0.5, 0.25, 0.25]
    train_config.store_dataset_sample = no_mask_channel
    train_config.check_exclusive = False

    if machine_has_gpu:
        expected_train_losses = [0.4554231, 0.4550124]
        expected_val_losses = [0.4553894, 0.4553061]
    else:
        expected_train_losses = [0.4554231, 0.4550112]
        expected_val_losses = [0.4553893, 0.4553061]
    loss_absolute_tolerance = 1e-6
    expected_learning_rates = [train_config.l_rate, 5.3589e-4]

    model_training_result, _ = model_train_unittest(train_config, output_folder=output_dirs)
    assert isinstance(model_training_result, StoringLogger)

    # Check that all metrics from the BatchTimeCallback are present
    # TODO: re-enable once the BatchTimeCallback is fixed
    # for epoch, epoch_results in model_training_result.results_per_epoch.items():
    #     for prefix in [TRAIN_PREFIX, VALIDATION_PREFIX]:
    #         for metric_type in [BatchTimeCallback.EPOCH_TIME,
    #                             BatchTimeCallback.BATCH_TIME + " avg",
    #                             BatchTimeCallback.BATCH_TIME + " max",
    #                             BatchTimeCallback.EXCESS_LOADING_TIME]:
    #             expected = BatchTimeCallback.METRICS_PREFIX + prefix + metric_type
    #             assert expected in epoch_results, f"Expected {expected} in results for epoch {epoch}"
    #             # Excess loading time can be zero because that only measures batches over the threshold
    #             if metric_type != BatchTimeCallback.EXCESS_LOADING_TIME:
    #                 value = epoch_results[expected]
    #                 assert isinstance(value, float)
    #                 assert value > 0.0, f"Time for {expected} should be > 0"

    actual_train_losses = model_training_result.get_train_metric(MetricType.LOSS.value)
    actual_val_losses = model_training_result.get_val_metric(MetricType.LOSS.value)
    print("actual_train_losses = {}".format(actual_train_losses))
    print("actual_val_losses = {}".format(actual_val_losses))

    def assert_all_close(metric: str, expected: List[float], **kwargs: Any) -> None:
        actual = model_training_result.get_train_metric(metric)
        assert np.allclose(actual, expected, **kwargs), f"Mismatch for {metric}: Got {actual}, expected {expected}"

    # check to make sure training batches are NOT all the same across epochs
    _check_patch_centers(model_training_result.train_diagnostics, should_equal=False)
    # check to make sure validation batches are all the same across epochs
    _check_patch_centers(model_training_result.val_diagnostics, should_equal=True)

    assert_all_close(MetricType.SUBJECT_COUNT.value, [3.0, 3.0])
    assert_all_close(MetricType.LEARNING_RATE.value, expected_learning_rates, rtol=1e-6)

    if is_windows():
        # Randomization comes out slightly different on Windows. Skip the rest of the detailed checks.
        return

    # Simple regression test: Voxel counts should be the same in both epochs on the validation set,
    # and be the same across 'region' and 'region_1' because they derive from the same Nifti files.
    # The following values are read off directly from the results of compute_dice_across_patches in the
    # training loop. This checks that averages are computed correctly, and that metric computers are reset
    # after each epoch.
    train_voxels = [[82765.0, 83212.0, 82740.0], [82831.0, 82647.0, 83255.0]]
    val_voxels = [[82765.0, 83212.0], [82765.0, 83212.0]]
    _check_voxel_count(model_training_result.train_results_per_epoch(), _mean_list(train_voxels), "Train")
    _check_voxel_count(model_training_result.val_results_per_epoch(), _mean_list(val_voxels), "Val")

    assert np.allclose(actual_train_losses, expected_train_losses, atol=loss_absolute_tolerance), "Train losses"
    assert np.allclose(actual_val_losses, expected_val_losses, atol=loss_absolute_tolerance), "Val losses"

    # Check that the metric we track for Hyperdrive runs is actually written.
    assert TrackedMetrics.Val_Loss.value.startswith(VALIDATION_PREFIX)
    tracked_metric = TrackedMetrics.Val_Loss.value[len(VALIDATION_PREFIX):]
    for val_result in model_training_result.val_results_per_epoch():
        assert tracked_metric in val_result

    # The following values are read off directly from the results of compute_dice_across_patches in the
    # training loop. Results are slightly different on GPU, hence use a larger tolerance there.
    dice_tolerance = 1e-3 if machine_has_gpu else 4.5e-4
    train_dice_region = [[0.0, 0.0, 0.0], [0.0376, 0.0343, 0.1017]]
    train_dice_region1 = [[0.4845, 0.4814, 0.4829], [0.4822, 0.4747, 0.4426]]
    # There appears to be some amount of non-determinism here: When using a tolerance of 1e-4, we get occasional
    # test failures on Linux in the cloud (not on Windows, not on AzureML). Unclear where it comes from. Even when
    # failing here, the losses match up to the expected tolerance.
    assert_all_close("Dice/region", _mean_list(train_dice_region), atol=dice_tolerance)
    assert_all_close("Dice/region_1", _mean_list(train_dice_region1), atol=dice_tolerance)
    expected_average_dice = [_mean(train_dice_region[i] + train_dice_region1[i])  # type: ignore
                             for i in range(len(train_dice_region))]
    assert_all_close("Dice/AverageAcrossStructures", expected_average_dice, atol=dice_tolerance)

    # check output files/directories
    assert train_config.outputs_folder.is_dir()
    assert train_config.logs_folder.is_dir()
    # Tensorboard event files go into a Lightning subfolder (PyTorch Lightning default)
    assert (train_config.logs_folder / "Lightning").is_dir()
    assert len(list((train_config.logs_folder / "Lightning").glob("events*"))) == 1

    assert train_config.num_epochs == 2
    # Checkpoint folder
    assert train_config.checkpoint_folder.is_dir()
    actual_checkpoints = list(train_config.checkpoint_folder.rglob("*.ckpt"))
    assert len(actual_checkpoints) == 1, f"Actual checkpoints: {actual_checkpoints}"
    assert (train_config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
    assert (train_config.outputs_folder / DATASET_CSV_FILE_NAME).is_file()
    assert (train_config.outputs_folder / STORED_CSV_FILE_NAMES[ModelExecutionMode.TRAIN]).is_file()
    assert (train_config.outputs_folder / STORED_CSV_FILE_NAMES[ModelExecutionMode.VAL]).is_file()

    # Patch visualization: There should be 3 slices for each of the 2 subjects
    sampling_folder = train_config.outputs_folder / PATCH_SAMPLING_FOLDER
    assert sampling_folder.is_dir()
    assert train_config.show_patch_sampling > 0
    assert len(list(sampling_folder.rglob("*.png"))) == 3 * train_config.show_patch_sampling

    # Test for saving of example images
    if train_config.store_dataset_sample:
        assert train_config.example_images_folder.is_dir()
    example_files = list(train_config.example_images_folder.rglob("*.*"))
    # images x epochs x patients
    assert len(example_files) == (3 * 2 * 2 if train_config.store_dataset_sample else 0)