def test_rnn_classifier_via_config_1(use_combined_model: bool,
                                     imaging_feature_type: ImagingFeatureType,
                                     combine_hidden_state: bool,
                                     use_encoder_layer_norm: bool,
                                     use_mean_teacher_model: bool,
                                     test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if we can build a simple RNN classifier via the config, optionally combined with image features.
    This only tests the mechanics of training, not whether the model learned anything.
    """
    logging_to_stdout()
    config = ToySequenceModel(use_combined_model,
                              imaging_feature_type=imaging_feature_type,
                              combine_hidden_states=combine_hidden_state,
                              use_encoder_layer_norm=use_encoder_layer_norm,
                              use_mean_teacher_model=use_mean_teacher_model,
                              should_validate=False)
    config.use_mixed_precision = True
    config.set_output_to(test_output_dirs.root_dir)
    config.dataset_data_frame = _get_mock_sequence_dataset()
    # Patch the load_images function that will be called once we access a dataset item
    image_and_seg = ImageAndSegmentations[np.ndarray](images=np.random.uniform(0, 1, SCAN_SIZE),
                                                      segmentations=np.random.randint(0, 2, SCAN_SIZE))
    with mock.patch('InnerEye.ML.utils.io_util.load_image_in_known_formats', return_value=image_and_seg):
        model_train_unittest(config, dirs=test_output_dirs)

def test_run_model_with_invalid_trainer_arguments(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if the trainer_arguments in a LightningContainer are passed to the trainer.
    """
    container = DummyContainerWithInvalidTrainerArguments()
    with pytest.raises(Exception) as ex:
        model_train_unittest(config=None, output_folder=test_output_dirs, lightning_container=container)
    assert "no_such_argument" in str(ex)

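# `use_mixed_precision` in the test below is presumably supplied via pytest parametrization in the full
# test module. A minimal sketch of such a decorator, with assumed (illustrative) values:
@pytest.mark.parametrize("use_mixed_precision", [False, True])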
def test_train_2d_classification_model(test_output_dirs: OutputFolderForTests,
                                       use_mixed_precision: bool) -> None:
    """
    Test training and testing of 2d classification models.
    """
    logging_to_stdout(logging.DEBUG)
    config = ClassificationModelForTesting2D()
    config.set_output_to(test_output_dirs.root_dir)
    # Train for 4 epochs, checkpoints at epochs 2 and 4
    config.num_epochs = 4
    config.use_mixed_precision = use_mixed_precision
    model_training_result, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
    assert model_training_result is not None
    expected_learning_rates = [0.0001, 9.99971e-05, 9.99930e-05, 9.99861e-05]
    expected_train_loss = [0.705931, 0.698664, 0.694489, 0.693151]
    expected_val_loss = [1.078517, 1.140510, 1.199026, 1.248595]

    actual_train_loss = model_training_result.get_metric(is_training=True, metric_type=MetricType.LOSS.value)
    actual_val_loss = model_training_result.get_metric(is_training=False, metric_type=MetricType.LOSS.value)
    actual_lr = model_training_result.get_metric(is_training=True, metric_type=MetricType.LEARNING_RATE.value)
    assert actual_train_loss == pytest.approx(expected_train_loss, abs=1e-6)
    assert actual_val_loss == pytest.approx(expected_val_loss, abs=1e-6)
    assert actual_lr == pytest.approx(expected_learning_rates, rel=1e-5)
    test_results = model_testing.model_test(config, ModelExecutionMode.TRAIN, checkpoint_handler=checkpoint_handler)
    assert isinstance(test_results, InferenceMetricsForClassification)

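# `hidden_layer_num_feature_channels` is presumably parametrized in the full test module, covering both the
# default (no hidden layer) and an explicit width. A sketch with assumed values:
@pytest.mark.parametrize("hidden_layer_num_feature_channels", [None, 2])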
def test_non_image_encoder(test_output_dirs: OutputFolderForTests,
                           hidden_layer_num_feature_channels: Optional[int]) -> None:
    """
    Test if we can build a simple MLP model that only feeds off non-image features.
    """
    dataset_folder = Path(test_output_dirs.make_sub_dir("dataset"))
    dataset_contents = _get_fake_dataset_contents()
    (dataset_folder / DATASET_CSV_FILE_NAME).write_text(dataset_contents)
    config = NonImageEncoder(should_validate=False,
                             hidden_layer_num_feature_channels=hidden_layer_num_feature_channels)
    config.local_dataset = dataset_folder
    config.set_output_to(test_output_dirs.root_dir)
    config.max_batch_grad_cam = 1
    config.validate()
    # Run model training
    _, checkpoint_handler = model_train_unittest(config, output_folder=test_output_dirs)
    # Run model inference
    runner = MLRunner(config)
    runner.setup()
    runner.model_inference_train_and_test(checkpoint_paths=checkpoint_handler.get_checkpoints_to_test())
    assert config.get_total_number_of_non_imaging_features() == 18

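# `num_epochs` is presumably parametrized so that checkpoint cleanup is checked for both a single-epoch and
# a multi-epoch run. A sketch with assumed values:
@pytest.mark.parametrize("num_epochs", [1, 2])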
def test_autosave_checkpoints(test_output_dirs: OutputFolderForTests, num_epochs: int) -> None:
    """
    Tests that all autosave checkpoints are cleaned up after training.
    """
    # Lightning does not overwrite checkpoints in-place. Rather, it writes "autosave.ckpt", then writes
    # "autosave-v1.ckpt" and deletes "autosave.ckpt", then writes "autosave.ckpt" again and deletes
    # "autosave-v1.ckpt". All those checkpoints should be cleaned up after training; only the best
    # checkpoint should remain.
    config = DummyClassification()
    config.autosave_every_n_val_epochs = 1
    config.set_output_to(test_output_dirs.root_dir)
    config.num_epochs = num_epochs
    model_train_unittest(config, output_folder=test_output_dirs)
    assert len(list(config.checkpoint_folder.glob("*.*"))) == 1
    assert (config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()

def test_rnn_classifier_via_config_2(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if we can build an RNN classifier that learns sequences of the same kind as in
    test_rnn_classifier_toy_problem, but built via the config.
    """
    expected_max_train_loss = 0.71
    expected_max_val_loss = 0.71
    num_sequences = 100
    ml_util.set_random_seed(123)
    dataset_contents = "subject,index,feature,label\n"
    for subject in range(num_sequences):
        # Sequences have variable length
        sequence_length = np.random.choice([9, 10, 11, 12])
        # Each sequence is a series of 0s and 1s
        inputs = np.random.choice([0, 1], size=(sequence_length,), p=[1. / 3, 2. / 3])
        label = np.sum(inputs) > (sequence_length // 2)
        for i, value in enumerate(inputs.tolist()):
            dataset_contents += f"S{subject},{i},{value},{label}\n"
    logging_to_stdout()
    config = ToySequenceModel2(should_validate=False)
    config.num_epochs = 2
    config.set_output_to(test_output_dirs.root_dir)
    config.dataset_data_frame = _get_mock_sequence_dataset(dataset_contents)
    results, _ = model_train_unittest(config, dirs=test_output_dirs)
    actual_train_loss = results.get_metric(is_training=True, metric_type=MetricType.LOSS.value)[-1]
    actual_val_loss = results.get_metric(is_training=False, metric_type=MetricType.LOSS.value)[-1]
    print(f"Training loss after {config.num_epochs} epochs: {actual_train_loss}")
    print(f"Validation loss after {config.num_epochs} epochs: {actual_val_loss}")
    assert actual_train_loss <= expected_max_train_loss, "Training loss too high"
    assert actual_val_loss <= expected_max_val_loss, "Validation loss too high"

def test_recover_training_mean_teacher_model(test_output_dirs: OutputFolderForTests) -> None:
    """
    Tests that training can be recovered from a previous checkpoint.
    """
    config = DummyClassification()
    config.mean_teacher_alpha = 0.999
    config.autosave_every_n_val_epochs = 1
    config.set_output_to(test_output_dirs.root_dir / "original")
    os.makedirs(str(config.outputs_folder))
    original_checkpoint_folder = config.checkpoint_folder

    # First round of training
    config.num_epochs = 4
    model_train_unittest(config, output_folder=test_output_dirs)
    assert len(list(config.checkpoint_folder.glob("*.*"))) == 1
    assert (config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()

    # Restart training from the previous run
    config.num_epochs = 3
    config.set_output_to(test_output_dirs.root_dir / "recovered")
    os.makedirs(str(config.outputs_folder))
    # Make it seem like run recovery objects have been downloaded
    checkpoint_root = config.checkpoint_folder / "old_run"
    shutil.copytree(str(original_checkpoint_folder), str(checkpoint_root))
    # Create a new checkpoint handler and set run_recovery to the copied checkpoints
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    checkpoint_handler.run_recovery = RunRecovery([checkpoint_root])
    model_train_unittest(config, output_folder=test_output_dirs, checkpoint_handler=checkpoint_handler)
    # Remove recovery checkpoints
    shutil.rmtree(checkpoint_root)
    assert len(list(config.checkpoint_folder.glob("*.ckpt"))) == 1

def test_recovery_e2e(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test restarting a training run: train a small model for 5 epochs, then continue training to epoch 10
    from the results of the first training run.
    """
    model_config = DummyClassification()
    model_config.set_output_to(test_output_dirs.root_dir)
    num_epochs_1 = 5
    model_config.num_epochs = num_epochs_1
    storing_logger_1, checkpoint_handler = model_train_unittest(model_config, output_folder=test_output_dirs)
    # Logger should have results for epochs 0..4
    assert list(storing_logger_1.epochs) == list(range(num_epochs_1))
    # Now restart the job, train to epoch 10
    num_epochs_2 = 10
    model_config.num_epochs = num_epochs_2
    storing_logger_2, _ = model_train_unittest(model_config, output_folder=test_output_dirs,
                                               checkpoint_handler=checkpoint_handler)
    # Logger should have results only for epochs 5..9
    assert list(storing_logger_2.epochs) == list(range(num_epochs_1, num_epochs_2))

def _test_model_train(output_dirs: OutputFolderForTests,
                      image_channels: Any,
                      ground_truth_ids: Any,
                      no_mask_channel: bool = False) -> None:
    def _check_patch_centers(diagnostics_per_epoch: List[np.ndarray], should_equal: bool) -> None:
        patch_centers_epoch1 = diagnostics_per_epoch[0]
        assert len(diagnostics_per_epoch) > 1, "Not enough data to check patch centers, need at least 2 epochs"
        for diagnostic in diagnostics_per_epoch[1:]:
            assert np.array_equal(patch_centers_epoch1, diagnostic) == should_equal

    def _check_voxel_count(results_per_epoch: List[Dict[str, float]],
                           expected_voxel_count_per_epoch: List[float],
                           prefix: str) -> None:
        assert len(results_per_epoch) == len(expected_voxel_count_per_epoch)
        for epoch, (results, voxel_count) in enumerate(zip(results_per_epoch, expected_voxel_count_per_epoch)):
            # In the test data, both structures "region" and "region_1" are read from the same nifti file, hence
            # their voxel counts must be identical.
            for structure in ["region", "region_1"]:
                assert results[f"{MetricType.VOXEL_COUNT.value}/{structure}"] == pytest.approx(voxel_count, abs=1e-2), \
                    f"{prefix} voxel count mismatch for '{structure}' epoch {epoch}"

    def _mean(a: List[float]) -> float:
        return sum(a) / len(a)

    def _mean_list(lists: List[List[float]]) -> List[float]:
        return list(map(_mean, lists))

    logging_to_stdout(log_level=logging.DEBUG)
    train_config = DummyModel()
    train_config.local_dataset = base_path
    train_config.set_output_to(output_dirs.root_dir)
    train_config.image_channels = image_channels
    train_config.ground_truth_ids = ground_truth_ids
    train_config.mask_id = None if no_mask_channel else train_config.mask_id
    train_config.random_seed = 42
    train_config.class_weights = [0.5, 0.25, 0.25]
    train_config.store_dataset_sample = no_mask_channel
    train_config.check_exclusive = False

    if machine_has_gpu:
        expected_train_losses = [0.4554231, 0.4550124]
        expected_val_losses = [0.4553894, 0.4553061]
    else:
        expected_train_losses = [0.4554231, 0.4550112]
        expected_val_losses = [0.4553893, 0.4553061]
    loss_absolute_tolerance = 1e-6
    expected_learning_rates = [train_config.l_rate, 5.3589e-4]

    model_training_result, _ = model_train_unittest(train_config, output_folder=output_dirs)
    assert isinstance(model_training_result, StoringLogger)
    # Check that all metrics from the BatchTimeCallback are present
    # TODO: re-enable once the BatchTimeCallback is fixed
    # for epoch, epoch_results in model_training_result.results_per_epoch.items():
    #     for prefix in [TRAIN_PREFIX, VALIDATION_PREFIX]:
    #         for metric_type in [BatchTimeCallback.EPOCH_TIME,
    #                             BatchTimeCallback.BATCH_TIME + " avg",
    #                             BatchTimeCallback.BATCH_TIME + " max",
    #                             BatchTimeCallback.EXCESS_LOADING_TIME]:
    #             expected = BatchTimeCallback.METRICS_PREFIX + prefix + metric_type
    #             assert expected in epoch_results, f"Expected {expected} in results for epoch {epoch}"
    #             # Excess loading time can be zero because that only measures batches over the threshold
    #             if metric_type != BatchTimeCallback.EXCESS_LOADING_TIME:
    #                 value = epoch_results[expected]
    #                 assert isinstance(value, float)
    #                 assert value > 0.0, f"Time for {expected} should be > 0"

    actual_train_losses = model_training_result.get_train_metric(MetricType.LOSS.value)
    actual_val_losses = model_training_result.get_val_metric(MetricType.LOSS.value)
    print("actual_train_losses = {}".format(actual_train_losses))
    print("actual_val_losses = {}".format(actual_val_losses))

    def assert_all_close(metric: str, expected: List[float], **kwargs: Any) -> None:
        actual = model_training_result.get_train_metric(metric)
        assert np.allclose(actual, expected, **kwargs), f"Mismatch for {metric}: Got {actual}, expected {expected}"

    # Check that the training batches are NOT all the same across epochs
    _check_patch_centers(model_training_result.train_diagnostics, should_equal=False)
    # Check that the validation batches are all the same across epochs
    _check_patch_centers(model_training_result.val_diagnostics, should_equal=True)
    assert_all_close(MetricType.SUBJECT_COUNT.value, [3.0, 3.0])
    assert_all_close(MetricType.LEARNING_RATE.value, expected_learning_rates, rtol=1e-6)

    if is_windows():
        # Randomization comes out slightly different on Windows. Skip the rest of the detailed checks.
        return

    # Simple regression test: Voxel counts should be the same in both epochs on the validation set,
    # and be the same across 'region' and 'region_1' because they derive from the same Nifti files.
    # The following values are read off directly from the results of compute_dice_across_patches in the
    # training loop. This checks that averages are computed correctly, and that metric computers are reset
    # after each epoch.
    train_voxels = [[82765.0, 83212.0, 82740.0], [82831.0, 82647.0, 83255.0]]
    val_voxels = [[82765.0, 83212.0], [82765.0, 83212.0]]
    _check_voxel_count(model_training_result.train_results_per_epoch(), _mean_list(train_voxels), "Train")
    _check_voxel_count(model_training_result.val_results_per_epoch(), _mean_list(val_voxels), "Val")

    assert np.allclose(actual_train_losses, expected_train_losses, atol=loss_absolute_tolerance), "Train losses"
    assert np.allclose(actual_val_losses, expected_val_losses, atol=loss_absolute_tolerance), "Val losses"
    # Check that the metric we track for Hyperdrive runs is actually written.
    assert TrackedMetrics.Val_Loss.value.startswith(VALIDATION_PREFIX)
    tracked_metric = TrackedMetrics.Val_Loss.value[len(VALIDATION_PREFIX):]
    for val_result in model_training_result.val_results_per_epoch():
        assert tracked_metric in val_result

    # The following values are read off directly from the results of compute_dice_across_patches in the
    # training loop. Results are slightly different for GPU, hence use a larger tolerance there.
    dice_tolerance = 1e-3 if machine_has_gpu else 4.5e-4
    train_dice_region = [[0.0, 0.0, 0.0], [0.0376, 0.0343, 0.1017]]
    train_dice_region1 = [[0.4845, 0.4814, 0.4829], [0.4822, 0.4747, 0.4426]]
    # There appears to be some amount of non-determinism here: when using a tolerance of 1e-4, we get occasional
    # test failures on Linux in the cloud (not on Windows, not on AzureML). It is unclear where that comes from.
    # Even when failing here, the losses match up to the expected tolerance.
    assert_all_close("Dice/region", _mean_list(train_dice_region), atol=dice_tolerance)
    assert_all_close("Dice/region_1", _mean_list(train_dice_region1), atol=dice_tolerance)
    expected_average_dice = [_mean(train_dice_region[i] + train_dice_region1[i])  # type: ignore
                             for i in range(len(train_dice_region))]
    assert_all_close("Dice/AverageAcrossStructures", expected_average_dice, atol=dice_tolerance)

    # Check output files/directories
    assert train_config.outputs_folder.is_dir()
    assert train_config.logs_folder.is_dir()
    # Tensorboard event files go into a Lightning subfolder (PyTorch Lightning default)
    assert (train_config.logs_folder / "Lightning").is_dir()
    assert len(list((train_config.logs_folder / "Lightning").glob("events*"))) == 1

    assert train_config.num_epochs == 2
    # Checkpoint folder
    assert train_config.checkpoint_folder.is_dir()
    actual_checkpoints = list(train_config.checkpoint_folder.rglob("*.ckpt"))
    assert len(actual_checkpoints) == 1, f"Actual checkpoints: {actual_checkpoints}"
    assert (train_config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
    assert (train_config.outputs_folder / DATASET_CSV_FILE_NAME).is_file()
    assert (train_config.outputs_folder / STORED_CSV_FILE_NAMES[ModelExecutionMode.TRAIN]).is_file()
    assert (train_config.outputs_folder / STORED_CSV_FILE_NAMES[ModelExecutionMode.VAL]).is_file()

    # Patch visualization: There should be 3 slices for each of the 2 subjects
    sampling_folder = train_config.outputs_folder / PATCH_SAMPLING_FOLDER
    assert sampling_folder.is_dir()
    assert train_config.show_patch_sampling > 0
    assert len(list(sampling_folder.rglob("*.png"))) == 3 * train_config.show_patch_sampling

    # Test for saving of example images
    if train_config.store_dataset_sample:
        assert train_config.example_images_folder.is_dir()
    example_files = list(train_config.example_images_folder.rglob("*.*"))
    assert len(example_files) == (3 * 2 * 2 if train_config.store_dataset_sample else 0)  # images x epochs x patients

def test_recover_testing_from_run_recovery(mean_teacher_model: bool,
                                           test_output_dirs: OutputFolderForTests) -> None:
    """
    Checks that inference results are the same whether they come from a checkpoint in the same run,
    from a run recovery, or from a local_weights_path parameter.
    """
    # Train for 4 epochs
    config = DummyClassification()
    if mean_teacher_model:
        config.mean_teacher_alpha = 0.999
    config.set_output_to(test_output_dirs.root_dir / "original")
    os.makedirs(str(config.outputs_folder))

    train_results, checkpoint_handler = model_train_unittest(config, output_folder=test_output_dirs)
    assert len(train_results.train_results_per_epoch()) == config.num_epochs

    # Run inference on this
    test_results = model_test(config=config,
                              data_split=ModelExecutionMode.TEST,
                              checkpoint_paths=checkpoint_handler.get_checkpoints_to_test())
    assert isinstance(test_results, InferenceMetricsForClassification)

    # Mimic using a run recovery and check that the results are the same
    config_run_recovery = DummyClassification()
    if mean_teacher_model:
        config_run_recovery.mean_teacher_alpha = 0.999
    config_run_recovery.set_output_to(test_output_dirs.root_dir / "run_recovery")
    os.makedirs(str(config_run_recovery.outputs_folder))

    checkpoint_handler_run_recovery = get_default_checkpoint_handler(model_config=config_run_recovery,
                                                                     project_root=test_output_dirs.root_dir)
    # Make it seem like run recovery objects have been downloaded
    checkpoint_root = config_run_recovery.checkpoint_folder / "recovered"
    shutil.copytree(str(config.checkpoint_folder), str(checkpoint_root))
    checkpoint_handler_run_recovery.run_recovery = RunRecovery([checkpoint_root])
    test_results_run_recovery = model_test(config_run_recovery,
                                           data_split=ModelExecutionMode.TEST,
                                           checkpoint_paths=checkpoint_handler_run_recovery.get_checkpoints_to_test())
    assert isinstance(test_results_run_recovery, InferenceMetricsForClassification)
    assert test_results.metrics.values()[MetricType.CROSS_ENTROPY.value] == \
           test_results_run_recovery.metrics.values()[MetricType.CROSS_ENTROPY.value]

    # Run inference with the local checkpoints
    config_local_weights = DummyClassification()
    if mean_teacher_model:
        config_local_weights.mean_teacher_alpha = 0.999
    config_local_weights.set_output_to(test_output_dirs.root_dir / "local_weights_path")
    os.makedirs(str(config_local_weights.outputs_folder))

    local_weights_path = test_output_dirs.root_dir / "local_weights_file.pth"
    shutil.copyfile(str(config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX), local_weights_path)
    config_local_weights.local_weights_path = [local_weights_path]

    checkpoint_handler_local_weights = get_default_checkpoint_handler(model_config=config_local_weights,
                                                                      project_root=test_output_dirs.root_dir)
    checkpoint_handler_local_weights.download_recovery_checkpoints_or_weights()
    test_results_local_weights = model_test(config_local_weights,
                                            data_split=ModelExecutionMode.TEST,
                                            checkpoint_paths=checkpoint_handler_local_weights.get_checkpoints_to_test())
    assert isinstance(test_results_local_weights, InferenceMetricsForClassification)
    assert test_results.metrics.values()[MetricType.CROSS_ENTROPY.value] == \
           test_results_local_weights.metrics.values()[MetricType.CROSS_ENTROPY.value]

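# The image encoder test below takes many arguments that, in the full test module, are presumably supplied
# via pytest parametrization. A minimal single-combination sketch; all values are assumed for illustration
# only (reduction_factor=1.0 corresponds to the unreduced 8 feature channels mentioned in the test body):
@pytest.mark.parametrize(["encode_channels_jointly", "use_non_imaging_features",
                          "kernel_size_per_encoding_block", "stride_size_per_encoding_block",
                          "reduction_factor", "expected_num_reduced_features", "aggregation_type"],
                         [(False, True, None, None, 1.0, 8, AggregationType.Average)])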
def test_image_encoder(test_output_dirs: OutputFolderForTests,
                       encode_channels_jointly: bool,
                       use_non_imaging_features: bool,
                       kernel_size_per_encoding_block: Optional[Union[TupleInt3, List[TupleInt3]]],
                       stride_size_per_encoding_block: Optional[Union[TupleInt3, List[TupleInt3]]],
                       reduction_factor: float,
                       expected_num_reduced_features: int,
                       aggregation_type: AggregationType) -> None:
    """
    Test if the image encoder networks can be trained without errors (including GradCam computation
    and data augmentation).
    """
    logging_to_stdout()
    set_random_seed(0)
    dataset_folder = Path(test_output_dirs.make_sub_dir("dataset"))
    scan_size = (6, 64, 60)
    scan_files: List[str] = []
    for s in range(4):
        random_scan = np.random.uniform(0, 1, scan_size)
        scan_file_name = f"scan{s + 1}{NumpyFile.NUMPY.value}"
        np.save(str(dataset_folder / scan_file_name), random_scan)
        scan_files.append(scan_file_name)

    dataset_contents = """subject,channel,path,label,numerical1,numerical2,categorical1,categorical2
S1,week0,scan1.npy,,1,10,Male,Val1
S1,week1,scan2.npy,True,2,20,Female,Val2
S2,week0,scan3.npy,,3,30,Female,Val3
S2,week1,scan4.npy,False,4,40,Female,Val1
S3,week0,scan1.npy,,5,50,Male,Val2
S3,week1,scan3.npy,True,6,60,Male,Val2
"""
    (dataset_folder / "dataset.csv").write_text(dataset_contents)

    numerical_columns = ["numerical1", "numerical2"] if use_non_imaging_features else []
    categorical_columns = ["categorical1", "categorical2"] if use_non_imaging_features else []
    non_image_feature_channels = get_non_image_features_dict(default_channels=["week1", "week0"],
                                                             specific_channels={"categorical2": ["week1"]}) \
        if use_non_imaging_features else {}

    config_for_dataset = ScalarModelBase(local_dataset=dataset_folder,
                                         image_channels=["week0", "week1"],
                                         image_file_column="path",
                                         label_channels=["week1"],
                                         label_value_column="label",
                                         non_image_feature_channels=non_image_feature_channels,
                                         numerical_columns=numerical_columns,
                                         categorical_columns=categorical_columns,
                                         should_validate=False)
    config_for_dataset.read_dataset_into_dataframe_and_pre_process()

    dataset = ScalarDataset(config_for_dataset,
                            sample_transform=ScalarItemAugmentation(
                                ImageTransformationPipeline([RandomAffine(10), ColorJitter(0.2)],
                                                            use_different_transformation_per_channel=True)))
    assert len(dataset) == 3

    config = ImageEncoder(encode_channels_jointly=encode_channels_jointly,
                          should_validate=False,
                          numerical_columns=numerical_columns,
                          categorical_columns=categorical_columns,
                          non_image_feature_channels=non_image_feature_channels,
                          categorical_feature_encoder=config_for_dataset.categorical_feature_encoder,
                          encoder_dimensionality_reduction_factor=reduction_factor,
                          aggregation_type=aggregation_type,
                          scan_size=(6, 64, 60))
    if kernel_size_per_encoding_block:
        config.kernel_size_per_encoding_block = kernel_size_per_encoding_block
    if stride_size_per_encoding_block:
        config.stride_size_per_encoding_block = stride_size_per_encoding_block
    config.set_output_to(test_output_dirs.root_dir)
    config.max_batch_grad_cam = 1

    model = create_model_with_temperature_scaling(config)
    input_size: List[Tuple] = [(len(config.image_channels), *scan_size)]
    if use_non_imaging_features:
        input_size.append((config.get_total_number_of_non_imaging_features(),))
        # The original (unreduced) number of output feature channels is
        # num_initial_channels * (num_encoder_blocks - 1) = 4 * (3 - 1) = 8
        if encode_channels_jointly:
            # reduced_num_channels + num_non_img_features
            assert model.final_num_feature_channels == \
                   expected_num_reduced_features + config.get_total_number_of_non_imaging_features()
        else:
            # num_img_channels * reduced_num_channels + num_non_img_features
            assert model.final_num_feature_channels == \
                   len(config.image_channels) * expected_num_reduced_features + \
                   config.get_total_number_of_non_imaging_features()

    summarizer = ModelSummary(model)
    summarizer.generate_summary(input_sizes=input_size)
    config.local_dataset = dataset_folder
    config.validate()
    model_train_unittest(config, dirs=test_output_dirs)

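# `class_name` in the next test is presumably parametrized in the full test module; it only controls the name
# under which metrics and CSV rows are reported. A sketch with assumed values:
@pytest.mark.parametrize("class_name", ["Default", "DummyLabel"])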
def test_train_classification_model(class_name: str, test_output_dirs: OutputFolderForTests) -> None:
    """
    Test training and testing of classification models, asserting on the individual results from training
    and testing. Expected test results are stored for GPU with and without mixed precision.
    """
    logging_to_stdout(logging.DEBUG)
    config = ClassificationModelForTesting()
    config.class_names = config.target_names = [class_name]
    config.set_output_to(test_output_dirs.root_dir)
    # Train for 4 epochs, checkpoints at epochs 2 and 4
    config.num_epochs = 4
    model_training_result, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
    assert model_training_result is not None
    expected_learning_rates = [0.0001, 9.99971e-05, 9.99930e-05, 9.99861e-05]
    expected_train_loss = [0.686614, 0.686465, 0.686316, 0.686167]
    expected_val_loss = [0.737061, 0.736691, 0.736321, 0.735952]
    # Ensure that all metrics are computed on both training and validation set
    train_results_per_epoch = model_training_result.train_results_per_epoch()
    val_results_per_epoch = model_training_result.val_results_per_epoch()
    assert len(train_results_per_epoch) == config.num_epochs
    assert len(val_results_per_epoch) == config.num_epochs
    assert len(train_results_per_epoch[0]) >= 11
    assert len(val_results_per_epoch[0]) >= 11
    for metric in [MetricType.ACCURACY_AT_THRESHOLD_05,
                   MetricType.ACCURACY_AT_OPTIMAL_THRESHOLD,
                   MetricType.AREA_UNDER_PR_CURVE,
                   MetricType.AREA_UNDER_ROC_CURVE,
                   MetricType.CROSS_ENTROPY,
                   MetricType.LOSS,
                   MetricType.SECONDS_PER_BATCH,
                   MetricType.SECONDS_PER_EPOCH,
                   MetricType.SUBJECT_COUNT]:
        assert metric.value in train_results_per_epoch[0], f"{metric.value} not in training"
        assert metric.value in val_results_per_epoch[0], f"{metric.value} not in validation"

    actual_train_loss = model_training_result.get_metric(is_training=True, metric_type=MetricType.LOSS.value)
    actual_val_loss = model_training_result.get_metric(is_training=False, metric_type=MetricType.LOSS.value)
    actual_lr = model_training_result.get_metric(is_training=True, metric_type=MetricType.LEARNING_RATE.value)
    assert actual_train_loss == pytest.approx(expected_train_loss, abs=1e-6), "Training loss"
    assert actual_val_loss == pytest.approx(expected_val_loss, abs=1e-6), "Validation loss"
    assert actual_lr == pytest.approx(expected_learning_rates, rel=1e-5), "Learning rates"
    test_results = model_testing.model_test(config, ModelExecutionMode.TRAIN, checkpoint_handler=checkpoint_handler)
    assert isinstance(test_results, InferenceMetricsForClassification)
    expected_metrics = [0.636085, 0.735952]
    assert test_results.metrics.values(class_name)[MetricType.CROSS_ENTROPY.value] == \
           pytest.approx(expected_metrics, abs=1e-5)
    # Run the detailed log file checks only on CPU: on GPU the files contain slightly different metrics,
    # and here we mostly want to assert that the files look reasonable.
    if machine_has_gpu:
        return

    # Check epoch_metrics.csv
    epoch_metrics_path = config.outputs_folder / ModelExecutionMode.TRAIN.value / EPOCH_METRICS_FILE_NAME
    # Auto-format will break the long header line, hence the strange way of writing it!
    expected_epoch_metrics = \
        f"{LoggingColumns.Loss.value},{LoggingColumns.CrossEntropy.value}," \
        f"{LoggingColumns.AccuracyAtThreshold05.value},{LoggingColumns.LearningRate.value}," + \
        f"{LoggingColumns.AreaUnderRocCurve.value}," \
        f"{LoggingColumns.AreaUnderPRCurve.value}," \
        f"{LoggingColumns.AccuracyAtOptimalThreshold.value}," \
        f"{LoggingColumns.FalsePositiveRateAtOptimalThreshold.value}," \
        f"{LoggingColumns.FalseNegativeRateAtOptimalThreshold.value}," \
        f"{LoggingColumns.OptimalThreshold.value}," \
        f"{LoggingColumns.SubjectCount.value},{LoggingColumns.Epoch.value}," \
        f"{LoggingColumns.CrossValidationSplitIndex.value}\n" + \
        """0.6866141557693481,0.6866141557693481,0.5,0.0001,1.0,1.0,0.5,0.0,0.0,0.529514,2.0,0,-1
0.6864652633666992,0.6864652633666992,0.5,9.999712322065557e-05,1.0,1.0,0.5,0.0,0.0,0.529475,2.0,1,-1
0.6863163113594055,0.6863162517547607,0.5,9.999306876841536e-05,1.0,1.0,0.5,0.0,0.0,0.529437,2.0,2,-1
0.6861673593521118,0.6861673593521118,0.5,9.998613801725043e-05,1.0,1.0,0.5,0.0,0.0,0.529399,2.0,3,-1
"""
    check_log_file(epoch_metrics_path, expected_epoch_metrics, ignore_columns=[])

    # Check metrics.csv: This contains the per-subject per-epoch model outputs.
    # Randomization comes out slightly different on Windows, hence only execute this check on Linux.
    if common_util.is_windows():
        return

    metrics_path = config.outputs_folder / ModelExecutionMode.TRAIN.value / SUBJECT_METRICS_FILE_NAME
    metrics_expected = \
        f"""epoch,subject,prediction_target,model_output,label,data_split,cross_validation_split_index
0,S2,{class_name},0.529514,1,Train,-1
0,S4,{class_name},0.521659,0,Train,-1
1,S4,{class_name},0.521482,0,Train,-1
1,S2,{class_name},0.529475,1,Train,-1
2,S4,{class_name},0.521305,0,Train,-1
2,S2,{class_name},0.529437,1,Train,-1
3,S2,{class_name},0.529399,1,Train,-1
3,S4,{class_name},0.521128,0,Train,-1
"""
    check_log_file(metrics_path, metrics_expected, ignore_columns=[])

    # Check the SUBJECT_METRICS_FILE_NAME log inside the folder best_validation_epoch/Train, which is written
    # when we run model_test. Normally we would run it on the Test and Val splits, but for convenience we test
    # on the Train split here.
    inference_metrics_path = config.outputs_folder / get_best_epoch_results_path(ModelExecutionMode.TRAIN) / \
                             SUBJECT_METRICS_FILE_NAME
    inference_metrics_expected = \
        f"""prediction_target,subject,model_output,label,epoch,cross_validation_split_index,data_split
{class_name},S2,0.5293986201286316,1.0,{BEST_EPOCH_FOLDER_NAME},-1,Train
{class_name},S4,0.5211275815963745,0.0,{BEST_EPOCH_FOLDER_NAME},-1,Train
"""
    check_log_file(inference_metrics_path, inference_metrics_expected, ignore_columns=[])

    inference_model_output_path = config.outputs_folder / get_best_epoch_results_path(ModelExecutionMode.TRAIN) / \
                                  model_testing.MODEL_OUTPUT_CSV
    inference_model_output_expected = \
        f"""subject,prediction_target,label,model_output,cross_validation_split_index
S2,{class_name},1.000000,0.529399,-1
S4,{class_name},0.000000,0.521128,-1"""
    check_log_file(inference_model_output_path, inference_model_output_expected, ignore_columns=[])

def test_train_classification_multilabel_model(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test training and testing of multilabel classification models, asserting on the individual results
    from training and testing. Expected test results are stored for GPU with and without mixed precision.
    """
    logging_to_stdout(logging.DEBUG)
    config = DummyMulticlassClassification()
    config.set_output_to(test_output_dirs.root_dir)
    # Train for 4 epochs, checkpoints at epochs 2 and 4
    config.num_epochs = 4
    model_training_result, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
    assert model_training_result is not None
    expected_learning_rates = [0.0001, 9.99971e-05, 9.99930e-05, 9.99861e-05]
    expected_train_loss = [0.699870228767395, 0.6239662170410156, 0.551329493522644, 0.4825132489204407]
    expected_val_loss = [0.6299371719360352, 0.5546272993087769, 0.4843321740627289, 0.41909298300743103]
    # Ensure that all metrics are computed on both training and validation set
    train_results_per_epoch = model_training_result.train_results_per_epoch()
    val_results_per_epoch = model_training_result.val_results_per_epoch()
    assert len(train_results_per_epoch) == config.num_epochs
    assert len(val_results_per_epoch) == config.num_epochs
    assert len(train_results_per_epoch[0]) >= 11
    assert len(val_results_per_epoch[0]) >= 11

    for class_name in config.class_names:
        for metric in [MetricType.ACCURACY_AT_THRESHOLD_05,
                       MetricType.ACCURACY_AT_OPTIMAL_THRESHOLD,
                       MetricType.AREA_UNDER_PR_CURVE,
                       MetricType.AREA_UNDER_ROC_CURVE,
                       MetricType.CROSS_ENTROPY]:
            assert f'{metric.value}/{class_name}' in train_results_per_epoch[0], f"{metric.value} not in training"
            assert f'{metric.value}/{class_name}' in val_results_per_epoch[0], f"{metric.value} not in validation"
    for metric in [MetricType.LOSS,
                   MetricType.SECONDS_PER_EPOCH,
                   MetricType.SUBJECT_COUNT]:
        assert metric.value in train_results_per_epoch[0], f"{metric.value} not in training"
        assert metric.value in val_results_per_epoch[0], f"{metric.value} not in validation"

    actual_train_loss = model_training_result.get_metric(is_training=True, metric_type=MetricType.LOSS.value)
    actual_val_loss = model_training_result.get_metric(is_training=False, metric_type=MetricType.LOSS.value)
    actual_lr = model_training_result.get_metric(is_training=True, metric_type=MetricType.LEARNING_RATE.value)
    assert actual_train_loss == pytest.approx(expected_train_loss, abs=1e-6), "Training loss"
    assert actual_val_loss == pytest.approx(expected_val_loss, abs=1e-6), "Validation loss"
    assert actual_lr == pytest.approx(expected_learning_rates, rel=1e-5), "Learning rates"
    test_results = model_testing.model_test(config, ModelExecutionMode.TRAIN, checkpoint_handler=checkpoint_handler)
    assert isinstance(test_results, InferenceMetricsForClassification)

    expected_metrics = {MetricType.CROSS_ENTROPY: [1.3996, 5.2966, 1.4020, 0.3553, 0.6908],
                        MetricType.ACCURACY_AT_THRESHOLD_05: [0.0000, 0.0000, 0.0000, 1.0000, 1.0000]}
    for i, class_name in enumerate(config.class_names):
        for metric in expected_metrics.keys():
            assert expected_metrics[metric][i] == pytest.approx(
                test_results.metrics.get_single_metric(metric_name=metric, hue=class_name), 1e-4)

    def get_epoch_path(mode: ModelExecutionMode) -> Path:
        p = get_best_epoch_results_path(mode=mode)
        return config.outputs_folder / p / SUBJECT_METRICS_FILE_NAME

    path_to_best_epoch_train = get_epoch_path(ModelExecutionMode.TRAIN)
    path_to_best_epoch_val = get_epoch_path(ModelExecutionMode.VAL)
    path_to_best_epoch_test = get_epoch_path(ModelExecutionMode.TEST)

    generate_classification_notebook(result_notebook=config.outputs_folder /
                                                     get_ipynb_report_name(config.model_category.value),
                                     config=config,
                                     train_metrics=path_to_best_epoch_train,
                                     val_metrics=path_to_best_epoch_val,
                                     test_metrics=path_to_best_epoch_test)
    assert (config.outputs_folder / get_html_report_name(config.model_category.value)).exists()

    report_name_multilabel = f"{config.model_category.value}_multilabel"
    generate_classification_multilabel_notebook(result_notebook=config.outputs_folder /
                                                                get_ipynb_report_name(report_name_multilabel),
                                                config=config,
                                                train_metrics=path_to_best_epoch_train,
                                                val_metrics=path_to_best_epoch_val,
                                                test_metrics=path_to_best_epoch_test)
    assert (config.outputs_folder / get_html_report_name(report_name_multilabel)).exists()