def model(method: str, labeled: bool, backbone: str) -> RegistrationModel:
    """
    Build a registration model for the given method and backbone.

    The shared ``config`` is deep-copied first, so the adjustments made
    here never modify the original object.

    :param method: name of method
    :param labeled: whether the data is labeled
    :param backbone: name of backbone
    :return: the built object
    """
    model_config = deepcopy(config)
    model_config["method"] = method

    backbone_config = model_config["backbone"]  # type: ignore
    backbone_config["name"] = backbone
    if method == "conditional":
        # control_points is dropped (when present) for the conditional method
        backbone_config.pop("control_points", None)
    # backbone-specific arguments overwrite values from the base config
    backbone_config.update(backbone_args[backbone])  # type: ignore

    build_kwargs = {
        "name": method,  # TODO we store method twice
        "moving_image_size": moving_image_size,
        "fixed_image_size": fixed_image_size,
        "index_size": index_size,
        "labeled": labeled,
        "batch_size": batch_size,
        "config": model_config,
    }
    return REGISTRY.build_model(config=build_kwargs)  # type: ignore
def test_image_loss(self, config: dict, option: int, expected: int):
    """
    Check how many losses a labeled ddf model with a local backbone has
    under different image loss configurations.

    :param config: base config, deep-copied before being modified
    :param option: selects how the image loss config is altered
    :param expected: expected length of the losses of the built model
    """
    method, backbone, labeled = "ddf", "local", True

    train_config = deepcopy(config)
    train_config["method"] = method
    train_config["backbone"]["name"] = backbone
    # values already in the config take priority over backbone_args
    backbone_config = dict(backbone_args[backbone])  # type: ignore
    backbone_config.update(train_config["backbone"])
    train_config["backbone"] = backbone_config

    if option == 0:
        # remove image loss config, so loss is not used
        train_config["loss"].pop("image")
    elif option == 1:
        # set image loss weight to zero, so loss is not used
        train_config["loss"]["image"]["weight"] = 0.0
    elif option == 2:
        # remove image loss weight, so loss is used with default weight 1
        train_config["loss"]["image"].pop("weight")

    build_kwargs = {
        "name": method,  # TODO we store method twice
        "moving_image_size": moving_image_size,
        "fixed_image_size": fixed_image_size,
        "index_size": index_size,
        "labeled": labeled,
        "batch_size": batch_size,
        "config": train_config,
    }
    ddf_model = REGISTRY.build_model(config=build_kwargs)
    num_losses = len(ddf_model._model.losses)  # type: ignore
    assert num_losses == expected
def predict(
    gpu: str,
    gpu_allow_growth: bool,
    ckpt_path: str,
    mode: str,
    batch_size: int,
    exp_name: str,
    config_path: Union[str, List[str]],
    save_nifti: bool = True,
    save_png: bool = True,
    log_dir: str = "logs",
):
    """
    Function to predict some metrics from the saved model and logging results.

    :param gpu: which env gpu to use.
    :param gpu_allow_growth: whether to allow gpu growth or not
    :param ckpt_path: where model is stored, should be like log_folder/save/ckpt-x
    :param mode: train / valid / test, to define which split of dataset to be evaluated
    :param batch_size: int, batch size per GPU to perform predictions in
    :param exp_name: name of the experiment
    :param config_path: to overwrite the default config
    :param save_nifti: if true, outputs will be saved in nifti format
    :param save_png: if true, outputs will be saved in png format
    :param log_dir: path of the log directory
    """
    # TODO support custom sample_label
    logging.warning("sample_label is not used in predict. "
                    "It is True if and only if mode == 'train'.")

    # env vars
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    # "true" enables memory growth; this mapping was previously inverted
    os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = (
        "true" if gpu_allow_growth else "false"
    )

    # load config
    config, log_dir, ckpt_path = build_config(
        config_path=config_path,
        log_dir=log_dir,
        exp_name=exp_name,
        ckpt_path=ckpt_path,
    )
    preprocess_config = config["train"]["preprocess"]
    # batch_size corresponds to batch_size per GPU
    gpus = tf.config.experimental.list_physical_devices("GPU")
    preprocess_config["batch_size"] = batch_size * max(len(gpus), 1)

    # data
    data_loader, dataset, _ = build_dataset(
        dataset_config=config["dataset"],
        preprocess_config=preprocess_config,
        mode=mode,
        training=False,
        repeat=False,
    )
    assert data_loader is not None

    try:
        # optimizer
        optimizer = opt.build_optimizer(
            optimizer_config=config["train"]["optimizer"])

        # model
        model: tf.keras.Model = REGISTRY.build_model(config=dict(
            name=config["train"]["method"],
            moving_image_size=data_loader.moving_image_shape,
            fixed_image_size=data_loader.fixed_image_shape,
            index_size=data_loader.num_indices,
            labeled=config["dataset"]["labeled"],
            batch_size=config["train"]["preprocess"]["batch_size"],
            config=config["train"],
        ))

        # metrics
        model.compile(optimizer=optimizer)

        # load weights
        if ckpt_path.endswith(".ckpt"):
            # for ckpt from tf.keras.callbacks.ModelCheckpoint
            # skip warnings because of optimizers
            # https://stackoverflow.com/questions/58289342/tf2-0-translation-model-error-when-restoring-the-saved-model-unresolved-object
            model.load_weights(ckpt_path).expect_partial()  # pragma: no cover
        else:
            # for ckpts from ckpt manager callback
            _, _ = build_checkpoint_callback(
                model=model,
                dataset=dataset,
                log_dir=log_dir,
                save_period=config["train"]["save_period"],
                ckpt_path=ckpt_path,
            )

        # predict
        fixed_grid_ref = tf.expand_dims(
            layer_util.get_reference_grid(
                grid_size=data_loader.fixed_image_shape),
            axis=0,
        )  # shape = (1, f_dim1, f_dim2, f_dim3, 3)
        predict_on_dataset(
            dataset=dataset,
            fixed_grid_ref=fixed_grid_ref,
            model=model,
            model_method=config["train"]["method"],
            save_dir=os.path.join(log_dir, "test"),
            save_nifti=save_nifti,
            save_png=save_png,
        )
    finally:
        # close the opened files in data loaders, even if prediction failed
        data_loader.close()
def train(
    gpu: str,
    config_path: Union[str, List[str]],
    gpu_allow_growth: bool,
    ckpt_path: str,
    exp_name: str = "",
    log_dir: str = "logs",
    max_epochs: int = -1,
):
    """
    Function to train a model.

    :param gpu: which local gpu to use to train.
    :param config_path: path to configuration set up.
    :param gpu_allow_growth: if true, TF_FORCE_GPU_ALLOW_GROWTH is set to
        "true", otherwise to "false".
    :param ckpt_path: where to store training checkpoints.
    :param exp_name: experiment name.
    :param log_dir: path of the log directory.
    :param max_epochs: if max_epochs > 0, will use it to overwrite the configuration.
    """
    # set env variables
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = (
        "true" if gpu_allow_growth else "false"
    )

    # load config
    config, log_dir, ckpt_path = build_config(
        config_path=config_path,
        log_dir=log_dir,
        exp_name=exp_name,
        ckpt_path=ckpt_path,
        max_epochs=max_epochs,
    )

    # build dataset
    data_loader_train, dataset_train, steps_per_epoch_train = build_dataset(
        dataset_config=config["dataset"],
        preprocess_config=config["train"]["preprocess"],
        mode="train",
        training=True,
        repeat=True,
    )
    assert data_loader_train is not None  # train data should not be None

    # defined before the try so the finally clause can always inspect it
    data_loader_val = None
    try:
        data_loader_val, dataset_val, steps_per_epoch_val = build_dataset(
            dataset_config=config["dataset"],
            preprocess_config=config["train"]["preprocess"],
            mode="valid",
            training=False,
            repeat=True,
        )

        # use strategy to support multiple GPUs
        # the network is mirrored in each GPU so that we can use larger batch size
        # https://www.tensorflow.org/guide/distributed_training
        # only model, optimizer and metrics need to be defined inside the strategy
        num_devices = max(len(tf.config.list_physical_devices("GPU")), 1)
        if num_devices > 1:
            strategy = tf.distribute.MirroredStrategy()  # pragma: no cover
        else:
            strategy = tf.distribute.get_strategy()
        with strategy.scope():
            model: tf.keras.Model = REGISTRY.build_model(
                config=dict(
                    name=config["train"]["method"],
                    moving_image_size=data_loader_train.moving_image_shape,
                    fixed_image_size=data_loader_train.fixed_image_shape,
                    index_size=data_loader_train.num_indices,
                    labeled=config["dataset"]["labeled"],
                    batch_size=config["train"]["preprocess"]["batch_size"],
                    config=config["train"],
                    num_devices=num_devices,
                )
            )
            optimizer = opt.build_optimizer(
                optimizer_config=config["train"]["optimizer"]
            )

        # compile
        model.compile(optimizer=optimizer)
        model.plot_model(output_dir=log_dir)

        # build callbacks
        tensorboard_callback = tf.keras.callbacks.TensorBoard(
            log_dir=log_dir, histogram_freq=config["train"]["save_period"]
        )
        ckpt_callback, initial_epoch = build_checkpoint_callback(
            model=model,
            dataset=dataset_train,
            log_dir=log_dir,
            save_period=config["train"]["save_period"],
            ckpt_path=ckpt_path,
        )
        callbacks = [tensorboard_callback, ckpt_callback]

        # train
        # it's necessary to define the steps_per_epoch
        # and validation_steps to prevent errors like
        # BaseCollectiveExecutor::StartAbort Out of range: End of sequence
        model.fit(
            x=dataset_train,
            steps_per_epoch=steps_per_epoch_train,
            initial_epoch=initial_epoch,
            epochs=config["train"]["epochs"],
            validation_data=dataset_val,
            validation_steps=steps_per_epoch_val,
            callbacks=callbacks,
        )
    finally:
        # close file loaders in data loaders, even if training failed
        data_loader_train.close()
        if data_loader_val is not None:
            data_loader_val.close()
def train(
    gpu: str,
    config_path: Union[str, List[str]],
    ckpt_path: str,
    num_workers: int = 1,
    gpu_allow_growth: bool = True,
    exp_name: str = "",
    log_dir: str = "logs",
    max_epochs: int = -1,
):
    """
    Function to train a model.

    :param gpu: which local gpu to use to train.
    :param config_path: path to configuration set up.
    :param ckpt_path: where to store training checkpoints.
    :param num_workers: number of cpu cores to be used, <=0 means not limited.
    :param gpu_allow_growth: if true, TF_FORCE_GPU_ALLOW_GROWTH is set to
        "true", otherwise to "false".
    :param exp_name: experiment name.
    :param log_dir: path of the log directory.
    :param max_epochs: if max_epochs > 0, will use it to overwrite the configuration.
    :raises ValueError: if multiple devices are used and the batch size
        is not divisible by the number of devices.
    """
    # set env variables
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = (
        "true" if gpu_allow_growth else "false"
    )
    # only positive values limit the CPU usage; the previous `<= 0` test
    # applied the limit exactly when it was documented to be removed
    if num_workers > 0:
        # limit CPU usage
        # https://github.com/tensorflow/tensorflow/issues/29968#issuecomment-789604232
        os.environ["OMP_NUM_THREADS"] = str(num_workers)
        os.environ["TF_NUM_INTRAOP_THREADS"] = str(num_workers)
        os.environ["TF_NUM_INTEROP_THREADS"] = str(num_workers)
        logger.info(
            "Limiting CPU usage by setting environment variables "
            "OMP_NUM_THREADS, TF_NUM_INTRAOP_THREADS, TF_NUM_INTEROP_THREADS to %d. "
            "This may slow down the training. "
            "Please use --num_workers flag to modify the behavior. "
            "Setting to 0 or negative values will remove the limitation.",
            num_workers,
        )

    # load config
    config, log_dir, ckpt_path = build_config(
        config_path=config_path,
        log_dir=log_dir,
        exp_name=exp_name,
        ckpt_path=ckpt_path,
        max_epochs=max_epochs,
    )

    # build dataset
    data_loader_train, dataset_train, steps_per_epoch_train = build_dataset(
        dataset_config=config["dataset"],
        preprocess_config=config["train"]["preprocess"],
        split="train",
        training=True,
        repeat=True,
    )
    assert data_loader_train is not None  # train data should not be None

    # defined before the try so the finally clause can always inspect it
    data_loader_val = None
    try:
        data_loader_val, dataset_val, steps_per_epoch_val = build_dataset(
            dataset_config=config["dataset"],
            preprocess_config=config["train"]["preprocess"],
            split="valid",
            training=False,
            repeat=True,
        )

        # use strategy to support multiple GPUs
        # the network is mirrored in each GPU so that we can use larger batch size
        # https://www.tensorflow.org/guide/distributed_training
        # only model, optimizer and metrics need to be defined inside the strategy
        num_devices = max(len(tf.config.list_physical_devices("GPU")), 1)
        batch_size = config["train"]["preprocess"]["batch_size"]
        if num_devices > 1:  # pragma: no cover
            strategy = tf.distribute.MirroredStrategy()
            if batch_size % num_devices != 0:
                raise ValueError(
                    f"batch size {batch_size} can not be divided evenly "
                    f"by the number of devices.")
        else:
            strategy = tf.distribute.get_strategy()
        with strategy.scope():
            model: tf.keras.Model = REGISTRY.build_model(config=dict(
                name=config["train"]["method"],
                moving_image_size=data_loader_train.moving_image_shape,
                fixed_image_size=data_loader_train.fixed_image_shape,
                index_size=data_loader_train.num_indices,
                labeled=config["dataset"]["train"]["labeled"],
                batch_size=batch_size,
                config=config["train"],
            ))
            optimizer = opt.build_optimizer(
                optimizer_config=config["train"]["optimizer"])
            model.compile(optimizer=optimizer)
            model.plot_model(output_dir=log_dir)

        # build callbacks
        tensorboard_callback = tf.keras.callbacks.TensorBoard(
            log_dir=log_dir,
            histogram_freq=config["train"]["save_period"],
            update_freq=config["train"].get("update_freq", "epoch"),
        )
        ckpt_callback, initial_epoch = build_checkpoint_callback(
            model=model,
            dataset=dataset_train,
            log_dir=log_dir,
            save_period=config["train"]["save_period"],
            ckpt_path=ckpt_path,
        )
        callbacks = [tensorboard_callback, ckpt_callback]

        # train
        # it's necessary to define the steps_per_epoch
        # and validation_steps to prevent errors like
        # BaseCollectiveExecutor::StartAbort Out of range: End of sequence
        model.fit(
            x=dataset_train,
            steps_per_epoch=steps_per_epoch_train,
            initial_epoch=initial_epoch,
            epochs=config["train"]["epochs"],
            validation_data=dataset_val,
            validation_steps=steps_per_epoch_val,
            callbacks=callbacks,
        )
    finally:
        # close file loaders in data loaders, even if training failed
        data_loader_train.close()
        if data_loader_val is not None:
            data_loader_val.close()
def predict(
    gpu: str,
    ckpt_path: str,
    split: str,
    batch_size: int,
    exp_name: str,
    config_path: Union[str, List[str]],
    num_workers: int = 1,
    gpu_allow_growth: bool = True,
    save_nifti: bool = True,
    save_png: bool = True,
    log_dir: str = "logs",
):
    """
    Function to predict some metrics from the saved model and logging results.

    :param gpu: which env gpu to use.
    :param ckpt_path: where model is stored, should be like log_folder/save/ckpt-x.
    :param split: train / valid / test, to define the split to be evaluated.
    :param batch_size: int, batch size to perform predictions.
    :param exp_name: name of the experiment.
    :param config_path: to overwrite the default config.
    :param num_workers: number of cpu cores to be used, <=0 means not limited.
    :param gpu_allow_growth: if true, TF_FORCE_GPU_ALLOW_GROWTH is set to
        "true", otherwise to "false".
    :param save_nifti: if true, outputs will be saved in nifti format.
    :param save_png: if true, outputs will be saved in png format.
    :param log_dir: path of the log directory.
    :raises ValueError: if multiple devices are used and the batch size
        is not divisible by the number of devices.
    """
    # env vars
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    # "true" enables memory growth; this mapping was previously inverted
    os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = (
        "true" if gpu_allow_growth else "false"
    )
    # only positive values limit the CPU usage; the previous `<= 0` test
    # applied the limit exactly when it was documented to be removed
    if num_workers > 0:
        # limit CPU usage
        # https://github.com/tensorflow/tensorflow/issues/29968#issuecomment-789604232
        os.environ["OMP_NUM_THREADS"] = str(num_workers)
        os.environ["TF_NUM_INTRAOP_THREADS"] = str(num_workers)
        os.environ["TF_NUM_INTEROP_THREADS"] = str(num_workers)
        logger.info(
            "Limiting CPU usage by setting environment variables "
            "OMP_NUM_THREADS, TF_NUM_INTRAOP_THREADS, TF_NUM_INTEROP_THREADS to %d. "
            "This may slow down the prediction. "
            "Please use --num_workers flag to modify the behavior. "
            "Setting to 0 or negative values will remove the limitation.",
            num_workers,
        )

    # load config
    config, log_dir, ckpt_path = build_config(
        config_path=config_path,
        log_dir=log_dir,
        exp_name=exp_name,
        ckpt_path=ckpt_path,
    )
    config["train"]["preprocess"]["batch_size"] = batch_size

    # data
    data_loader, dataset, _ = build_dataset(
        dataset_config=config["dataset"],
        preprocess_config=config["train"]["preprocess"],
        split=split,
        training=False,
        repeat=False,
    )
    assert data_loader is not None

    try:
        # use strategy to support multiple GPUs
        # the network is mirrored in each GPU so that we can use larger batch size
        # https://www.tensorflow.org/guide/distributed_training
        # only model, optimizer and metrics need to be defined inside the strategy
        num_devices = max(len(tf.config.list_physical_devices("GPU")), 1)
        if num_devices > 1:  # pragma: no cover
            strategy = tf.distribute.MirroredStrategy()
            if batch_size % num_devices != 0:
                raise ValueError(
                    f"batch size {batch_size} can not be divided evenly "
                    f"by the number of devices.")
        else:
            strategy = tf.distribute.get_strategy()
        with strategy.scope():
            model: tf.keras.Model = REGISTRY.build_model(config=dict(
                name=config["train"]["method"],
                moving_image_size=data_loader.moving_image_shape,
                fixed_image_size=data_loader.fixed_image_shape,
                index_size=data_loader.num_indices,
                labeled=config["dataset"][split]["labeled"],
                batch_size=batch_size,
                config=config["train"],
            ))
            optimizer = opt.build_optimizer(
                optimizer_config=config["train"]["optimizer"])
            model.compile(optimizer=optimizer)
            model.plot_model(output_dir=log_dir)

        # load weights
        if ckpt_path.endswith(".ckpt"):
            # for ckpt from tf.keras.callbacks.ModelCheckpoint
            # skip warnings because of optimizers
            # https://stackoverflow.com/questions/58289342/tf2-0-translation-model-error-when-restoring-the-saved-model-unresolved-object
            model.load_weights(ckpt_path).expect_partial()  # pragma: no cover
        else:
            # for ckpts from ckpt manager callback
            _, _ = build_checkpoint_callback(
                model=model,
                dataset=dataset,
                log_dir=log_dir,
                save_period=config["train"]["save_period"],
                ckpt_path=ckpt_path,
            )

        # predict
        fixed_grid_ref = tf.expand_dims(
            layer_util.get_reference_grid(
                grid_size=data_loader.fixed_image_shape),
            axis=0,
        )  # shape = (1, f_dim1, f_dim2, f_dim3, 3)
        predict_on_dataset(
            dataset=dataset,
            fixed_grid_ref=fixed_grid_ref,
            model=model,
            save_dir=os.path.join(log_dir, "test"),
            save_nifti=save_nifti,
            save_png=save_png,
        )
    finally:
        # close the opened files in data loaders, even if prediction failed
        data_loader.close()