def load_image(path: PathOrString, image_type: Optional[Type] = float) -> ImageWithHeader:
    """
    Loads an image from a Nifti, Numpy, PNG, or HDF5 file.
    For HDF5 files, the path must carry a '|'-separated suffix:
    For images: <file_path>|<dataset_name>|<channel index>
    For binary segmentations: <file_path>|<dataset_name>|<channel index>
    For multimap segmentations: <file_path>|<dataset_name>|<channel index>|<multimap value>
    The expected dimensions are (channel, Z, Y, X).
    :param path: The path to the file.
    :param image_type: The type of the image.
    """
    SEPARATOR = '|'
    if is_nifti_file_path(path):
        return load_nifti_image(path, image_type)
    elif is_numpy_file_path(path):
        image = load_numpy_image(path, image_type)
        header = get_unit_image_header()
        return ImageWithHeader(image, header)
    elif SEPARATOR in str(path):
        hdf5_path_parts = str(path).split(SEPARATOR)
        if len(hdf5_path_parts) == 4:  # segmentation multimap
            h5_path = hdf5_path_parts[0]
            dataset = hdf5_path_parts[1]
            channel = int(hdf5_path_parts[2])
            segmentation_id = int(hdf5_path_parts[3])
            # Create a binary mask by comparing the channel against the multimap value
            image = load_hdf5_dataset_from_file(Path(h5_path), dataset)[channel] == segmentation_id
            header = get_unit_image_header()
            return ImageWithHeader(image, header)
        elif len(hdf5_path_parts) == 3:
            h5_path = hdf5_path_parts[0]
            dataset = hdf5_path_parts[1]
            channel = int(hdf5_path_parts[2])
            image = load_hdf5_dataset_from_file(Path(h5_path), dataset)[channel]
            header = get_unit_image_header()
            return ImageWithHeader(image, header)
    elif is_png(path):
        import imageio
        # np.float is deprecated; use the builtin float (an alias of np.float64)
        image = imageio.imread(path).astype(float)
        header = get_unit_image_header()
        return ImageWithHeader(image, header)
    raise ValueError(f"Invalid file type {path}")
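
# Illustrative usage of the '|'-separated HDF5 path syntax accepted by load_image.
# This is a minimal sketch: "scan.h5", "labels.h5" and the dataset names are hypothetical,
# not files that exist in the repository.
def _load_image_usage_example() -> None:
    # Channel 0 of dataset "volume" inside scan.h5:
    image = load_image("scan.h5|volume|0")
    # Binary mask for multimap value 2, taken from channel 0 of dataset "segmentation":
    mask = load_image("labels.h5|segmentation|0|2")
    # Both calls return an ImageWithHeader whose image is (Z, Y, X) after channel selection
    assert image.image.shape == mask.image.shape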
def test_register_and_score_model(test_output_dirs: OutputFolderForTests) -> None:
    """
    End-to-end test which ensures the scoring pipeline is functioning as expected when used on a recently
    created model. This test is run after training an ensemble run in AzureML. It starts
    "submit_for_inference" via Popen. The inference run here is on a 2-channel model, whereas
    test_submit_for_inference works with a 1-channel model.
    """
    azureml_model = get_most_recent_model(fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    assert azureml_model is not None
    assert PYTHON_ENVIRONMENT_NAME in azureml_model.tags, "Environment name not present in model tags"
    # Download the registered model and test that we can run the score pipeline on it
    model_root = Path(azureml_model.download(str(test_output_dirs.root_dir)))
    # The model needs to contain score.py at the root, the (merged) environment definition,
    # and the inference config.
    expected_files = [
        *fixed_paths.SCRIPTS_AT_ROOT,
        fixed_paths.ENVIRONMENT_YAML_FILE_NAME,
        fixed_paths.MODEL_INFERENCE_JSON_FILE_NAME,
        "InnerEye/ML/runner.py",
    ]
    for expected_file in expected_files:
        assert (model_root / expected_file).is_file(), f"File {expected_file} missing"
    checkpoint_folder = model_root / CHECKPOINT_FOLDER
    assert checkpoint_folder.is_dir()
    checkpoints = list(checkpoint_folder.rglob("*"))
    assert len(checkpoints) >= 1, "There must be at least 1 checkpoint"
    # Create a dummy datastore to hold the image data
    test_datastore = test_output_dirs.root_dir / "test_datastore"
    # Copy test data into the data folder to simulate an actual run
    train_and_test_data_dir = full_ml_test_data_path("train_and_test_data")
    img_files = ["id1_channel1.nii.gz", "id1_channel2.nii.gz"]
    data_root = test_datastore / fixed_paths.DEFAULT_DATA_FOLDER
    data_root.mkdir(parents=True)
    for f in img_files:
        shutil.copy(str(train_and_test_data_dir / f), str(data_root))
    # Run the score pipeline as a separate process
    python_executable = sys.executable
    return_code1, stdout1 = spawn_and_monitor_subprocess(process=python_executable, args=["--version"])
    assert return_code1 == 0
    print(f"Executing Python version {stdout1[0]}")
    return_code, stdout2 = spawn_and_monitor_subprocess(
        process=python_executable,
        args=[str(model_root / fixed_paths.SCORE_SCRIPT),
              f"--data_folder={str(data_root)}",
              f"--image_files={img_files[0]},{img_files[1]}",
              "--use_gpu=False"])
    # Check that the process completed as expected
    assert return_code == 0, f"Subprocess failed with return code {return_code}. Stdout: {os.linesep.join(stdout2)}"
    expected_segmentation_path = Path(model_root) / DEFAULT_RESULT_IMAGE_NAME
    assert expected_segmentation_path.exists(), f"Result file not found: {expected_segmentation_path}"
    # Sanity check the resulting segmentation
    expected_shape = get_nifti_shape(train_and_test_data_dir / img_files[0])
    image_header = get_unit_image_header()
    assert_nifti_content(str(expected_segmentation_path), expected_shape, image_header, [3], np.ubyte)
def test_store_inference_results(test_output_dirs: OutputFolderForTests) -> None:
    np.random.seed(0)
    num_classes = 2
    posterior = torch.nn.functional.softmax(
        torch.from_numpy(np.random.random_sample((num_classes, dim_z, dim_y, dim_x))),
        dim=0).numpy()
    segmentation = np.argmax(posterior, axis=0)
    assert segmentation.shape == (dim_z, dim_y, dim_x)
    posterior0 = to_unique_bytes(posterior[0], (0, 1))
    posterior1 = to_unique_bytes(posterior[1], (0, 1))
    spacing = (2.0, 2.0, 2.0)
    header = get_unit_image_header(spacing=spacing)
    inference_result = InferencePipeline.Result(epoch=1,
                                                patient_id=12,
                                                posteriors=posterior,
                                                segmentation=segmentation,
                                                voxel_spacing_mm=(1, 1, 1))
    test_config = _create_config_with_folders(test_output_dirs)
    assert test_config.class_and_index_with_background() == {"background": 0, "region": 1}
    results_folder = test_output_dirs.root_dir
    store_inference_results(inference_result, test_config, Path(results_folder), header)
    assert_nifti_content(results_folder / "012" / "posterior_background.nii.gz",
                         segmentation.shape, header, list(posterior0), np.ubyte)
    assert_nifti_content(results_folder / "012" / "posterior_region.nii.gz",
                         segmentation.shape, header, list(posterior1), np.ubyte)
    assert_nifti_content(results_folder / "012" / "background.nii.gz",
                         segmentation.shape, header, [0, 1], np.ubyte)
    assert_nifti_content(results_folder / "012" / "region.nii.gz",
                         segmentation.shape, header, [0, 1], np.ubyte)
    assert_nifti_content(results_folder / "012" / DEFAULT_RESULT_IMAGE_NAME,
                         segmentation.shape, header, list(np.unique(segmentation)), np.ubyte)
    assert_nifti_content(results_folder / "012" / "uncertainty.nii.gz",
                         inference_result.uncertainty.shape, header, [248, 249, 253, 254], np.ubyte)
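
# For context, a plausible sketch of the to_unique_bytes helper used above (an assumption:
# the actual implementation in the repository may differ). It rescales values from the given
# input range to [0, 255], casts to unsigned bytes, and returns the unique values, which is
# what assert_nifti_content compares against once posteriors are stored as ubyte Nifti files.
from typing import Tuple  # assumed available; shown here to keep the sketch self-contained

def _to_unique_bytes_sketch(data: np.ndarray, input_range: Tuple[float, float]) -> np.ndarray:
    low, high = input_range
    scaled = (data - low) / (high - low) * 255.0
    return np.unique(scaled.astype(np.ubyte))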
def test_register_and_score_model(is_ensemble: bool,
                                  dataset_expected_spacing_xyz: Any,
                                  model_outside_package: bool,
                                  test_output_dirs: OutputFolderForTests) -> None:
    """
    End-to-end test which ensures the scoring pipeline is functioning as expected by performing the following:
    1) Registering a pre-trained model to AML
    2) Checking that a model zip from the registered model can be created successfully
    3) Calling the scoring pipeline to check inference can be run from the published model successfully
    """
    ws = get_default_workspace()
    # Get an existing config as template
    loader = get_model_loader("Tests.ML.configs" if model_outside_package else None)
    config: SegmentationModelBase = loader.create_model_config_from_name(
        model_name="BasicModel2EpochsOutsidePackage" if model_outside_package else "BasicModel2Epochs")
    config.dataset_expected_spacing_xyz = dataset_expected_spacing_xyz
    config.set_output_to(test_output_dirs.root_dir)
    # Copy checkpoints into the outputs folder (simulating a training run)
    stored_checkpoints = full_ml_test_data_path(os.path.join("train_and_test_data", "checkpoints"))
    shutil.copytree(str(stored_checkpoints), str(config.checkpoint_folder))
    paths = [config.checkpoint_folder / "1_checkpoint.pth.tar"]
    checkpoints = paths * 2 if is_ensemble else paths
    model = None
    model_path = None
    # Mock the repository root so that the source comes from the current directory:
    # score.py and python_wrapper.py cannot be moved inside the InnerEye package, which will be the
    # only code running (if these tests are run on the package).
    with mock.patch('InnerEye.Common.fixed_paths.repository_root_directory',
                    return_value=tests_root_directory().parent):
        try:
            tags = {"model_name": config.model_name}
            azure_config = get_default_azure_config()
            if model_outside_package:
                azure_config.extra_code_directory = "Tests"  # contains DummyModel
            deployment_hook = lambda cfg, azure_cfg, mdl, is_ens: (Path(cfg.model_name),
                                                                   azure_cfg.docker_shm_size)
            ml_runner = MLRunner(config, azure_config, model_deployment_hook=deployment_hook)
            model, deployment_path, deployment_details = ml_runner.register_segmentation_model(
                workspace=ws,
                tags=tags,
                best_epoch=0,
                best_epoch_dice=0,
                checkpoint_paths=checkpoints,
                model_proc=ModelProcessing.DEFAULT)
            assert model is not None
            model_path = Path(model.get_model_path(model.name, model.version, ws))
            assert (model_path / fixed_paths.ENVIRONMENT_YAML_FILE_NAME).exists()
            assert (model_path / Path("InnerEye/ML/runner.py")).exists()
            assert deployment_path == Path(config.model_name)
            assert deployment_details == azure_config.docker_shm_size
            # Copy test data into the data folder to simulate an actual run
            train_and_test_data_dir = full_ml_test_data_path("train_and_test_data")
            img_channel_1_name = "id1_channel1.nii.gz"
            img_channel_1_path = train_and_test_data_dir / img_channel_1_name
            img_channel_2_name = "id1_channel2.nii.gz"
            img_channel_2_path = train_and_test_data_dir / img_channel_2_name
            # Download the registered model and test that we can run the score pipeline on it
            model_root = Path(model.download(str(test_output_dirs.root_dir)))
            # Create a dummy datastore to store model checkpoints and image data.
            # This simulates the code snapshot being executed in a real run.
            test_datastore = test_output_dirs.root_dir / "test_datastore"
            shutil.move(str(model_root / "test_outputs"),
                        str(test_datastore / RELATIVE_TEST_OUTPUTS_PATH))
            data_root = test_datastore / DEFAULT_DATA_FOLDER
            os.makedirs(data_root)
            shutil.copy(str(img_channel_1_path), data_root)
            shutil.copy(str(img_channel_2_path), data_root)
            # Run the score pipeline as a separate process, using the python_wrapper.py code
            # to simulate a real run
            return_code = SubprocessConfig(process="python", args=[
                str(model_root / "python_wrapper.py"),
                "--spawnprocess=python",
                str(model_root / "score.py"),
                f"--data-folder={str(test_datastore)}",
                f"--test_image_channels={img_channel_1_name},{img_channel_2_name}",
                "--use_gpu=False"
            ]).spawn_and_monitor_subprocess()
            # Check that the process completed as expected
            assert return_code == 0
            expected_segmentation_path = Path(model_root) / DEFAULT_RESULT_IMAGE_NAME
            assert expected_segmentation_path.exists()
            # Sanity check the resulting segmentation
            expected_shape = get_nifti_shape(img_channel_1_path)
            image_header = get_unit_image_header()
            assert_nifti_content(str(expected_segmentation_path), expected_shape, image_header, [0], np.ubyte)
        finally:
            # Delete the registered model, and any downloaded artifacts
            shutil.rmtree(test_output_dirs.root_dir)
            if model and model_path:
                model.delete()
                shutil.rmtree(model_path)
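
# MLRunner's model_deployment_hook argument (used above as a lambda) is a callback that is
# invoked after model registration. A minimal named-function sketch of the same contract;
# the parameter meanings are inferred from how the lambda is called in this test:
def _example_deployment_hook(cfg, azure_cfg, mdl, is_ens):
    # cfg: the model config, azure_cfg: the AzureConfig, mdl: the registered AzureML model,
    # is_ens: whether this is an ensemble. The return value is surfaced to the caller as the
    # deployment result, which is exactly what the assertions above check.
    return Path(cfg.model_name), azure_cfg.docker_shm_size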
def test_visualize_patch_sampling(test_output_dirs: TestOutputDirectories,
                                  labels_to_boundary: bool) -> None:
    """
    Tests if patch sampling and producing diagnostic images works as expected.
    :param test_output_dirs: The output folders to use for the test.
    :param labels_to_boundary: If true, the ground truth labels are placed close to the image boundary,
    so that crops have to be adjusted inwards. If false, ground truth labels are all far from the image
    boundaries.
    """
    set_random_seed(0)
    shape = (10, 30, 30)
    foreground_classes = ["fg"]
    class_weights = equally_weighted_classes(foreground_classes)
    config = SegmentationModelBase(should_validate=False,
                                   crop_size=(2, 10, 10),
                                   class_weights=class_weights)
    image = np.random.rand(1, *shape).astype(np.float32) * 1000
    mask = np.ones(shape)
    labels = np.zeros((len(class_weights),) + shape)
    if labels_to_boundary:
        # Generate foreground labels in such a way that a patch centered around a foreground pixel would
        # reach outside of the image.
        labels[1, 4:8, 3:27, 3:27] = 1
    else:
        labels[1, 4:8, 15:18, 15:18] = 1
    labels[0] = 1 - labels[1]
    output_folder = Path(test_output_dirs.root_dir)
    image_header = get_unit_image_header()
    sample = Sample(image=image,
                    mask=mask,
                    labels=labels,
                    metadata=PatientMetadata(patient_id='123', image_header=image_header))
    expected_folder = full_ml_test_data_path("patch_sampling")
    heatmap = visualize_random_crops(sample, config, output_folder=output_folder)
    expected_heatmap = expected_folder / ("sampled_to_boundary.npy" if labels_to_boundary else "sampled_center.npy")
    # To update the stored results, uncomment this line:
    # np.save(str(expected_heatmap), heatmap)
    assert np.allclose(heatmap, np.load(str(expected_heatmap))), "Patch sampling created a different heatmap."
    f1 = output_folder / "123_ct.nii.gz"
    assert_file_exists(f1)
    f2 = output_folder / "123_sampled_patches.nii.gz"
    assert_file_exists(f2)
    thumbnails = [
        "123_sampled_patches_dim0.png",
        "123_sampled_patches_dim1.png",
        "123_sampled_patches_dim2.png",
    ]
    for f in thumbnails:
        assert_file_exists(output_folder / f)
    expected = expected_folder / ("sampled_to_boundary.nii.gz" if labels_to_boundary else "sampled_center.nii.gz")
    # To update the stored results, uncomment this line:
    # shutil.copy(str(f2), str(expected))
    expected_image = io_util.load_nifti_image(expected)
    actual_image = io_util.load_nifti_image(f2)
    assert np.allclose(expected_image.image, actual_image.image), "Patch sampling created a different Nifti image."
    if labels_to_boundary:
        for f in thumbnails:
            # To update the stored results, uncomment this line:
            # (expected_folder / f).write_bytes((output_folder / f).read_bytes())
            if not is_running_on_azure():
                # When running on the Azure build agents, the bounding box of the images is slightly
                # different than on local runs, even with equal dpi settings. We have not been able to
                # make the results consistent, hence this check is disabled in cloud runs.
                assert_binary_files_match(output_folder / f, expected_folder / f)
def test_register_and_score_model(is_ensemble: bool,
                                  dataset_expected_spacing_xyz: Any,
                                  model_outside_package: bool,
                                  test_output_dirs: OutputFolderForTests) -> None:
    """
    End-to-end test which ensures the scoring pipeline is functioning as expected by performing the following:
    1) Registering a pre-trained model to AML
    2) Checking that a model zip from the registered model can be created successfully
    3) Calling the scoring pipeline to check inference can be run from the published model successfully
    """
    # We are creating checkpoints on the fly in this test, writing a randomly initialized model.
    set_random_seed(0)
    # Get an existing config as template
    loader = get_model_loader("Tests.ML.configs" if model_outside_package else None)
    config: SegmentationModelBase = loader.create_model_config_from_name(
        model_name="BasicModel2EpochsOutsidePackage" if model_outside_package else "BasicModel2Epochs")
    config.dataset_expected_spacing_xyz = dataset_expected_spacing_xyz
    config.set_output_to(test_output_dirs.root_dir)
    checkpoints_absolute = []
    model_and_info = ModelAndInfo(config=config, model_execution_mode=ModelExecutionMode.TRAIN)
    model_and_info.create_model()
    model_and_info.create_optimizer()
    checkpoints_absolute.append(model_and_info.save_checkpoint(epoch=10))
    if is_ensemble:
        checkpoints_absolute.append(model_and_info.save_checkpoint(epoch=20))
    checkpoints_relative = [f.relative_to(config.checkpoint_folder) for f in checkpoints_absolute]
    azureml_model = None
    # Simulate a project root: We can't derive that from the repository root because that might point
    # into Python's package folder.
    project_root = Path(__file__).parent.parent
    # Double-check that we are at the right place, by testing for a file that would quite certainly not be
    # found anywhere else.
    assert (project_root / fixed_paths.SCORE_SCRIPT).is_file()
    try:
        azure_config = get_default_azure_config()
        if model_outside_package:
            azure_config.extra_code_directory = "Tests"  # contains BasicModel2EpochsOutsidePackage
        deployment_hook = lambda cfg, azure_cfg, mdl, is_ens: (Path(cfg.model_name),
                                                               azure_cfg.docker_shm_size)
        ml_runner = MLRunner(config,
                             azure_config,
                             project_root=project_root,
                             model_deployment_hook=deployment_hook)
        registration_result = ml_runner.register_segmentation_model(
            model_description="",
            checkpoint_paths=checkpoints_absolute,
            model_proc=ModelProcessing.DEFAULT)
        assert registration_result is not None
        azureml_model, deployment_result = registration_result
        assert azureml_model is not None
        assert deployment_result == (Path(config.model_name), azure_config.docker_shm_size)
        # Download the registered model and test that we can run the score pipeline on it
        model_root = Path(azureml_model.download(str(test_output_dirs.root_dir)))
        # The model needs to contain score.py at the root, the (merged) environment definition,
        # and the inference config.
        expected_files = [
            *fixed_paths.SCRIPTS_AT_ROOT,
            fixed_paths.ENVIRONMENT_YAML_FILE_NAME,
            fixed_paths.MODEL_INFERENCE_JSON_FILE_NAME,
            "InnerEye/ML/runner.py",
        ]
        # All checkpoints go into their own folder
        expected_files.extend(str(Path(CHECKPOINT_FOLDER) / c) for c in checkpoints_relative)
        for expected_file in expected_files:
            assert (model_root / expected_file).is_file(), f"File {expected_file} missing"
        # Create a dummy datastore to hold the image data
        test_datastore = test_output_dirs.root_dir / "test_datastore"
        # Copy test data into the data folder to simulate an actual run
        train_and_test_data_dir = full_ml_test_data_path("train_and_test_data")
        img_files = ["id1_channel1.nii.gz", "id1_channel2.nii.gz"]
        data_root = test_datastore / fixed_paths.DEFAULT_DATA_FOLDER
        data_root.mkdir(parents=True)
        for f in img_files:
            shutil.copy(str(train_and_test_data_dir / f), str(data_root))
        # Run the score pipeline as a separate process
        python_executable = sys.executable
        return_code1, stdout1 = SubprocessConfig(process=python_executable,
                                                 args=["--version"]).spawn_and_monitor_subprocess()
        assert return_code1 == 0
        print(f"Executing Python version {stdout1[0]}")
        return_code, stdout2 = SubprocessConfig(
            process=python_executable,
            args=[str(model_root / fixed_paths.SCORE_SCRIPT),
                  f"--data_folder={str(data_root)}",
                  f"--image_files={img_files[0]},{img_files[1]}",
                  "--use_gpu=False"]).spawn_and_monitor_subprocess()
        # Check that the process completed as expected
        assert return_code == 0, f"Subprocess failed with return code {return_code}. Stdout: {os.linesep.join(stdout2)}"
        expected_segmentation_path = Path(model_root) / DEFAULT_RESULT_IMAGE_NAME
        assert expected_segmentation_path.exists(), f"Result file not found: {expected_segmentation_path}"
        # Sanity check the resulting segmentation
        expected_shape = get_nifti_shape(train_and_test_data_dir / img_files[0])
        image_header = get_unit_image_header()
        assert_nifti_content(str(expected_segmentation_path), expected_shape, image_header, [3], np.ubyte)
    finally:
        # Delete the registered model
        if azureml_model:
            azureml_model.delete()
def visualize_random_crops(sample: Sample,
                           config: SegmentationModelBase,
                           output_folder: Path) -> np.ndarray:
    """
    Simulates the effect of sampling random crops (as is done for training segmentation models), and stores
    the results as a Nifti heatmap and as 3 axial/sagittal/coronal slices. The heatmap and the slices are
    stored in the given output folder, with filenames that contain the patient ID as the prefix.
    :param sample: The patient information from the dataset, with scans and ground truth labels.
    :param config: The model configuration.
    :param output_folder: The folder into which the heatmap and thumbnails should be written.
    :return: A numpy array that has the same size as the image, containing how often each voxel was contained
    in one of the sampled crops.
    """
    output_folder.mkdir(exist_ok=True, parents=True)
    sample = CroppingDataset.create_possibly_padded_sample_for_cropping(
        sample=sample,
        crop_size=config.crop_size,
        padding_mode=config.padding_mode)
    logging.info(f"Processing sample: {sample.patient_id}")
    # Exhaustively sample with the random crop function
    image_channel0 = sample.image[0]
    heatmap = np.zeros(image_channel0.shape, dtype=np.uint16)
    # The number of repeats must fit into the range of uint16, because we later save the heatmap as an
    # integer Nifti file of that datatype.
    repeats = 200
    for _ in range(repeats):
        slicers, _ = augmentation.slicers_for_random_crop(sample=sample,
                                                          crop_size=config.crop_size,
                                                          class_weights=config.class_weights)
        heatmap[slicers[0], slicers[1], slicers[2]] += 1
    is_3dim = heatmap.shape[0] > 1
    header = sample.metadata.image_header
    if not header:
        logging.warning(f"No image header found for patient {sample.patient_id}. Using default header.")
        header = get_unit_image_header()
    if is_3dim:
        ct_output_name = str(output_folder / f"{sample.patient_id}_ct.nii.gz")
        heatmap_output_name = str(output_folder / f"{sample.patient_id}_sampled_patches.nii.gz")
        io_util.store_as_nifti(image=heatmap,
                               header=header,
                               file_name=heatmap_output_name,
                               image_type=heatmap.dtype,
                               scale=False)
        io_util.store_as_nifti(image=image_channel0,
                               header=header,
                               file_name=ct_output_name,
                               image_type=sample.image.dtype,
                               scale=False)
    # np.float is deprecated; use the builtin float (an alias of np.float64)
    heatmap_scaled = heatmap.astype(dtype=float) / heatmap.max()
    # If the incoming image is effectively a 2D image with a degenerate Z dimension, then only plot a single
    # axial thumbnail. Otherwise, plot thumbnails for all 3 dimensions.
    dimensions = list(range(3)) if is_3dim else [0]
    # Center the 3 thumbnails at one of the points where the heatmap attains its maximum. This should ensure
    # that the thumbnails are in an area where many of the organs of interest are located.
    max_heatmap_index = np.unravel_index(heatmap.argmax(), heatmap.shape) if is_3dim else (0, 0, 0)
    for dimension in dimensions:
        plt.clf()
        scan_with_transparent_overlay(scan=image_channel0,
                                      overlay=heatmap_scaled,
                                      dimension=dimension,
                                      position=max_heatmap_index[dimension] if is_3dim else 0,
                                      spacing=header.spacing)
        # Construct a filename that has a dimension suffix if we are generating 3 thumbnails. For 2-dim
        # images, skip the suffix.
        thumbnail = f"{sample.patient_id}_sampled_patches"
        if is_3dim:
            thumbnail += f"_dim{dimension}"
        thumbnail += ".png"
        resize_and_save(width_inch=5, height_inch=5, filename=output_folder / thumbnail)
    return heatmap
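
# Illustrative usage of visualize_random_crops (a minimal sketch mirroring the test above;
# the shapes, patient ID, and label placement are arbitrary):
def _visualize_random_crops_example(config: SegmentationModelBase, output_folder: Path) -> np.ndarray:
    shape = (10, 30, 30)
    labels = np.zeros((2,) + shape)
    labels[1, 4:8, 15:18, 15:18] = 1  # a small foreground blob away from the boundary
    labels[0] = 1 - labels[1]
    sample = Sample(image=np.random.rand(1, *shape).astype(np.float32),
                    mask=np.ones(shape),
                    labels=labels,
                    metadata=PatientMetadata(patient_id='42', image_header=get_unit_image_header()))
    # Writes 42_ct.nii.gz, 42_sampled_patches.nii.gz, and the thumbnail PNGs to output_folder,
    # and returns the per-voxel crop-coverage counts
    return visualize_random_crops(sample, config, output_folder=output_folder)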