def get_save_handler(config):
    if exp_tracking.has_clearml:
        from ignite.contrib.handlers.clearml_logger import ClearMLSaver

        return ClearMLSaver(dirname=config.output_path.as_posix())
    return DiskSaver(config.output_path.as_posix())
def test_clearml_disk_saver_integration_no_logger():
    model = torch.nn.Module()
    to_save_serializable = {"model": model}

    with pytest.warns(UserWarning, match="ClearMLSaver created a temporary checkpoints directory"):
        clearml.Task.current_task = Mock(return_value=object())
        clearml.binding.frameworks.WeightsFileHandler.create_output_model = MagicMock()
        clearml_saver = ClearMLSaver()
        checkpoint = Checkpoint(to_save=to_save_serializable, save_handler=clearml_saver, n_saved=1)

    trainer = Engine(lambda e, b: None)
    trainer.state = State(epoch=0, iteration=0)
    checkpoint(trainer)
    trainer.state.iteration = 1
    checkpoint(trainer)

    if clearml_saver._atomic:
        assert clearml.binding.frameworks.WeightsFileHandler.create_output_model.call_count == 2
    else:
        saved_files = list(os.listdir(clearml_saver.dirname))
        assert len(saved_files) == 1
        assert saved_files[0] == "model_1.pt"
def get_save_handler(config):
    if config["with_clearml"]:
        from ignite.contrib.handlers.clearml_logger import ClearMLSaver

        return ClearMLSaver(dirname=config["output_path"])
    return DiskSaver(config["output_path"], require_empty=False)
def get_save_handler(output_path, with_clearml):
    if with_clearml:
        from ignite.contrib.handlers.clearml_logger import ClearMLSaver

        return ClearMLSaver(dirname=output_path)
    return DiskSaver(output_path)
def create_callbacks(self):
    ## SETUP CALLBACKS
    print('[INFO] Creating callback functions for training loop...', end='')

    # Early Stopping - stops training if the validation loss does not decrease
    # after EARLY_STOPPING_PATIENCE epochs
    handler = EarlyStopping(
        patience=self.config.EARLY_STOPPING_PATIENCE,
        score_function=score_function_loss,
        trainer=self.train_engine,
    )
    self.evaluator.add_event_handler(Events.COMPLETED, handler)
    print('Early Stopping ({} epochs)...'.format(self.config.EARLY_STOPPING_PATIENCE), end='')

    # Checkpointing - keeps the single best model by validation accuracy
    val_checkpointer = Checkpoint(
        {"model": self.model},
        ClearMLSaver(),
        n_saved=1,
        score_function=score_function_acc,
        score_name="val_acc",
        filename_prefix='cub200_{}_ignite_best'.format(self.config.MODEL.MODEL_NAME),
        global_step_transform=global_step_from_engine(self.train_engine),
    )
    self.evaluator.add_event_handler(Events.EPOCH_COMPLETED, val_checkpointer)
    print('Model Checkpointing...', end='')

    print('Done')
def _test_save_model_optimizer_lr_scheduler_with_state_dict(device, on_zero_rank=False):
    if idist.get_rank() == 0:
        clearml.Task.current_task = Mock(return_value=object())
        clearml.binding.frameworks.WeightsFileHandler.create_output_model = MagicMock()

    torch.manual_seed(23)

    model = DummyModel().to(device)

    optim = torch.optim.SGD(model.parameters(), lr=0.1)
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.5)

    def update_fn(engine, batch):
        x = torch.rand((4, 2)).to(device)
        optim.zero_grad()
        y = model(x)
        # Below code raises: RuntimeError: torch_xla/csrc/tensor_impl.cpp:144 : XLA tensors do not have storage
        # Probably related to https://github.com/pytorch/xla/issues/2576
        # loss = y.pow(2.0).sum()
        loss = y.sum()
        loss.backward()
        if idist.has_xla_support:
            import torch_xla.core.xla_model as xm

            xm.optimizer_step(optim, barrier=True)
        else:
            optim.step()
        lr_scheduler.step()

    engine = Engine(update_fn)

    to_save = {"model": model, "optimizer": optim, "lr_scheduler": lr_scheduler}

    with pytest.warns(UserWarning, match=r"ClearMLSaver created a temporary checkpoints directory"):
        clearml_saver = ClearMLSaver()

    if (not on_zero_rank) or (on_zero_rank and idist.get_rank() == 0):
        checkpoint = Checkpoint(to_save=to_save, save_handler=clearml_saver, n_saved=1)
        engine.add_event_handler(Events.EPOCH_COMPLETED, checkpoint)

    engine.run([0], max_epochs=4)

    idist.barrier()

    saved_objects = sorted(os.listdir(clearml_saver.dirname))
    # saved object is ['PREFIX_checkpoint_3.pt', ]
    saved_checkpoint = os.path.join(clearml_saver.dirname, saved_objects[0])

    if idist.has_xla_support:
        device = "cpu"

    loaded_obj = torch.load(saved_checkpoint, map_location=device)
    for f in ["model", "optimizer", "lr_scheduler"]:
        assert f in loaded_obj
    loaded_model_state_dict = loaded_obj["model"]
    loaded_optimizer_state_dict = loaded_obj["optimizer"]
    loaded_lr_scheduler_state_dict = loaded_obj["lr_scheduler"]

    assert isinstance(loaded_model_state_dict, dict)
    assert isinstance(loaded_optimizer_state_dict, dict)
    assert isinstance(loaded_lr_scheduler_state_dict, dict)

    # Specifically move device to CPU first
    model_state_dict = model.cpu().state_dict()
    for key in model_state_dict.keys():
        assert key in loaded_model_state_dict
        model_value = model_state_dict[key]
        loaded_model_value = loaded_model_state_dict[key]
        assert (model_value.cpu().numpy() == loaded_model_value.cpu().numpy()).all()

    optim_state_dict = optim.state_dict()
    for key in optim_state_dict.keys():
        assert key in loaded_optimizer_state_dict
        optim_value = optim_state_dict[key]
        loaded_optim_value = loaded_optimizer_state_dict[key]
        if idist.get_rank() == 0:
            assert optim_value == loaded_optim_value

    lr_scheduler_state_dict = lr_scheduler.state_dict()
    for key in lr_scheduler_state_dict.keys():
        assert key in loaded_lr_scheduler_state_dict
        lr_scheduler_value = lr_scheduler_state_dict[key]
        loaded_lr_scheduler_value = loaded_lr_scheduler_state_dict[key]
        assert lr_scheduler_value == loaded_lr_scheduler_value
def test_clearml_saver_callbacks():
    mock_task = MagicMock(spec=clearml.Task)
    mock_task.name = "check-task"

    mock_model = MagicMock(spec=clearml.OutputModel)

    model_info = WeightsFileHandler.ModelInfo(
        model=mock_model,
        upload_filename="test.pt",
        local_model_path="",
        local_model_id="",
        framework=Framework.pytorch,
        task=mock_task,
    )

    mock_model_info = MagicMock(spec_set=model_info)

    # Simulate 4 calls to save model and 2 to remove (n_saved=2)
    filenames = [
        "best_model_5_val_acc=0.123.pt",
        "best_model_6_val_acc=0.234.pt",
        "best_model_7_val_acc=0.356.pt",
        "best_model_8_val_acc=0.456.pt",
    ]
    metadata_list = [
        {"basename": "best_model", "score_name": "val_acc", "priority": 0.123},
        {"basename": "best_model", "score_name": "val_acc", "priority": 0.234},
        {"basename": "best_model", "score_name": "val_acc", "priority": 0.345},
        {"basename": "best_model", "score_name": "val_acc", "priority": 0.456},
    ]
    dirname = "/tmp/test"

    _checkpoint_slots = defaultdict(list)

    n_saved = 2

    for i, (filename, metadata) in enumerate(zip(filenames, metadata_list)):

        mock_model_info.upload_filename = filename

        if i >= n_saved:
            # Remove
            filename_to_remove = filenames[i % n_saved]
            for slots in _checkpoint_slots.values():
                try:
                    slots[slots.index(filename_to_remove)] = None
                except ValueError:
                    pass
                else:
                    i = i % n_saved
                    break

        basename = metadata["basename"]
        checkpoint_key = (dirname, basename)

        context = ClearMLSaver._CallbacksContext(
            callback_type=WeightsFileHandler.CallbackType,
            slots=_checkpoint_slots[checkpoint_key],
            checkpoint_key=str(checkpoint_key),
            filename=filename,
            basename=basename,
            metadata=metadata,
        )

        output_model_info = context.pre_callback(str(WeightsFileHandler.CallbackType.save), mock_model_info)
        assert (
            hasattr(output_model_info, "upload_filename")
            and f"{basename}_{i}.pt" in output_model_info.upload_filename
        )
        assert (
            hasattr(output_model_info, "local_model_id")
            and str(checkpoint_key) in output_model_info.local_model_id
        )

        output_model_info = context.post_callback(str(WeightsFileHandler.CallbackType.save), mock_model_info)
        assert hasattr(output_model_info, "model") and hasattr(output_model_info.model, "name")
        assert hasattr(output_model_info, "model") and hasattr(output_model_info.model, "comment")
        assert isinstance(output_model_info.model.name, str) and filename in output_model_info.model.name
        assert (
            isinstance(output_model_info.model.comment, str)
            and metadata["basename"] in output_model_info.model.comment
            and metadata["score_name"] in output_model_info.model.comment
        )
def run(train_batch_size, val_batch_size, epochs, lr, momentum):
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    clearml_logger = ClearMLLogger(project_name="examples", task_name="ignite")

    clearml_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )

    for tag, evaluator in [("training metrics", train_evaluator), ("validation metrics", validation_evaluator)]:
        clearml_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    clearml_logger.attach_opt_params_handler(
        trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer
    )

    clearml_logger.attach(
        trainer, log_handler=WeightsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100)
    )

    clearml_logger.attach(trainer, log_handler=WeightsHistHandler(model), event_name=Events.EPOCH_COMPLETED(every=100))

    clearml_logger.attach(
        trainer, log_handler=GradsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100)
    )

    clearml_logger.attach(trainer, log_handler=GradsHistHandler(model), event_name=Events.EPOCH_COMPLETED(every=100))

    handler = Checkpoint(
        {"model": model},
        ClearMLSaver(),
        n_saved=1,
        score_function=lambda e: e.state.metrics["accuracy"],
        score_name="val_acc",
        filename_prefix="best",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.EPOCH_COMPLETED, handler)

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    clearml_logger.close()