def test_event_list():
    e1 = Events.ITERATION_STARTED(once=1)
    e2 = Events.ITERATION_STARTED(every=3)
    e3 = Events.COMPLETED

    event_list = e1 | e2 | e3

    assert type(event_list) == EventsList
    assert len(event_list) == 3
    assert event_list[0] == e1
    assert event_list[1] == e2
    assert event_list[2] == e3
def test_callable_events_with_wrong_inputs():
    with pytest.raises(ValueError, match=r"Only one of the input arguments should be specified"):
        Events.ITERATION_STARTED()

    with pytest.raises(ValueError, match=r"Only one of the input arguments should be specified"):
        Events.ITERATION_STARTED(event_filter="123", every=12)

    with pytest.raises(TypeError, match=r"Argument event_filter should be a callable"):
        Events.ITERATION_STARTED(event_filter="123")

    with pytest.raises(ValueError, match=r"Argument every should be integer and greater than zero"):
        Events.ITERATION_STARTED(every=-1)

    with pytest.raises(ValueError, match=r"but will be called with"):
        Events.ITERATION_STARTED(event_filter=lambda x: x)
def test_callable_event_bad_behaviour():
    special_events = [1, 2, 5, 7, 17, 20]

    def custom_event_filter(engine, event):
        return event in special_events

    # Check bad behaviour: calling the event with a filter but discarding the
    # result must not modify the original event.
    engine = Engine(lambda e, b: b)
    counter = [0]

    # "Modify" events -- the returned filtered event is deliberately discarded
    Events.ITERATION_STARTED(event_filter=custom_event_filter)

    @engine.on(Events.ITERATION_STARTED)
    def assert_all_iters(engine):
        counter[0] += 1
        assert engine.state.iteration == counter[0]

    d = list(range(50))
    engine.run(d, max_epochs=25)

    assert counter[0] == engine.state.iteration
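# For contrast with the bad-behaviour test above, a minimal sketch (relying only
# on ignite's public Engine/Events API) of the correct usage: the filtered event
# returned by Events.ITERATION_STARTED(event_filter=...) must itself be attached
# for the filter to take effect.
from ignite.engine import Engine, Events

special_events = [1, 2, 5, 7, 17, 20]

def custom_event_filter(engine, event):
    # `event` is the value of the event's attribute, here the iteration number
    return event in special_events

engine = Engine(lambda e, b: b)
fired_on = []

@engine.on(Events.ITERATION_STARTED(event_filter=custom_event_filter))
def on_special_iterations(engine):
    fired_on.append(engine.state.iteration)

engine.run(list(range(10)), max_epochs=2)  # 20 iterations in total
assert fired_on == special_events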
def test_state_get_event_attrib_value():
    state = State()
    state.iteration = 10
    state.epoch = 9

    e = Events.ITERATION_STARTED
    assert state.get_event_attrib_value(e) == state.iteration

    e = Events.ITERATION_COMPLETED
    assert state.get_event_attrib_value(e) == state.iteration

    e = Events.EPOCH_STARTED
    assert state.get_event_attrib_value(e) == state.epoch

    e = Events.EPOCH_COMPLETED
    assert state.get_event_attrib_value(e) == state.epoch

    e = Events.STARTED
    assert state.get_event_attrib_value(e) == state.epoch

    e = Events.COMPLETED
    assert state.get_event_attrib_value(e) == state.epoch

    e = Events.ITERATION_STARTED(every=10)
    assert state.get_event_attrib_value(e) == state.iteration

    e = Events.ITERATION_COMPLETED(every=10)
    assert state.get_event_attrib_value(e) == state.iteration

    e = Events.EPOCH_STARTED(once=5)
    assert state.get_event_attrib_value(e) == state.epoch

    e = Events.EPOCH_COMPLETED(once=5)
    assert state.get_event_attrib_value(e) == state.epoch
def test_attach():
    n_epochs = 5
    data = list(range(50))

    def _test(event, n_calls):
        losses = torch.rand(n_epochs * len(data))
        losses_iter = iter(losses)

        def update_fn(engine, batch):
            return next(losses_iter)

        trainer = Engine(update_fn)

        logger = DummyLogger()
        mock_log_handler = MagicMock()

        logger.attach(trainer, log_handler=mock_log_handler, event_name=event)

        trainer.run(data, max_epochs=n_epochs)

        mock_log_handler.assert_called_with(trainer, logger, event)
        assert mock_log_handler.call_count == n_calls

    _test(Events.ITERATION_STARTED, len(data) * n_epochs)
    _test(Events.ITERATION_COMPLETED, len(data) * n_epochs)
    _test(Events.EPOCH_STARTED, n_epochs)
    _test(Events.EPOCH_COMPLETED, n_epochs)
    _test(Events.STARTED, 1)
    _test(Events.COMPLETED, 1)
    _test(Events.ITERATION_STARTED(every=10), len(data) // 10 * n_epochs)
def _setup_logging(
    logger: BaseLogger,
    trainer: Engine,
    optimizers: Optional[Union[Optimizer, Dict[str, Optimizer], Dict[None, Optimizer]]],
    evaluators: Optional[Union[Engine, Dict[str, Engine]]],
    log_every_iters: int,
) -> None:
    if optimizers is not None:
        if not isinstance(optimizers, (Optimizer, Mapping)):
            raise TypeError("Argument optimizers should be either a single optimizer or a dictionary of optimizers")

    if evaluators is not None:
        if not isinstance(evaluators, (Engine, Mapping)):
            raise TypeError("Argument evaluators should be either a single engine or a dictionary of engines")

    if log_every_iters is None:
        log_every_iters = 1

    logger.attach_output_handler(
        trainer, event_name=Events.ITERATION_COMPLETED(every=log_every_iters), tag="training", metric_names="all"
    )

    if optimizers is not None:
        # Log optimizer parameters
        if isinstance(optimizers, Optimizer):
            optimizers = {None: optimizers}

        for k, optimizer in optimizers.items():
            logger.attach_opt_params_handler(
                trainer, Events.ITERATION_STARTED(every=log_every_iters), optimizer, param_name="lr", tag=k
            )

    if evaluators is not None:
        # Log evaluation metrics
        if isinstance(evaluators, Engine):
            evaluators = {"validation": evaluators}

        event_name = Events.ITERATION_COMPLETED if isinstance(logger, WandBLogger) else None
        gst = global_step_from_engine(trainer, custom_event_name=event_name)
        for k, evaluator in evaluators.items():
            logger.attach_output_handler(
                evaluator, event_name=Events.COMPLETED, tag=k, metric_names="all", global_step_transform=gst
            )
def test_callable_events():
    assert isinstance(Events.ITERATION_STARTED.value, str)

    def foo(engine, event):
        return True

    ret = Events.ITERATION_STARTED(event_filter=foo)
    assert isinstance(ret, EventWithFilter)
    assert ret.event == Events.ITERATION_STARTED
    assert ret.filter == foo
    assert isinstance(Events.ITERATION_STARTED.value, str)

    # assert ret in Events
    assert Events.ITERATION_STARTED.name in "{}".format(ret)
    # assert ret in State.event_to_attr

    ret = Events.ITERATION_STARTED(every=10)
    assert isinstance(ret, EventWithFilter)
    assert ret.event == Events.ITERATION_STARTED
    assert ret.filter is not None

    # assert ret in Events
    assert Events.ITERATION_STARTED.name in "{}".format(ret)
    # assert ret in State.event_to_attr

    ret = Events.ITERATION_STARTED(once=10)
    assert isinstance(ret, EventWithFilter)
    assert ret.event == Events.ITERATION_STARTED
    assert ret.filter is not None

    # assert ret in Events
    assert Events.ITERATION_STARTED.name in "{}".format(ret)
    # assert ret in State.event_to_attr

    def _attach(e1, e2):
        assert id(e1) != id(e2)

    _attach(Events.ITERATION_STARTED(every=10), Events.ITERATION_COMPLETED(every=10))
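# Companion sketch to the assertions above (again only the public API): the
# filtered events produced by `every` and `once` attach exactly like plain
# events and fire on the expected iterations.
from ignite.engine import Engine, Events

engine = Engine(lambda e, b: b)
every_iters, once_iters = [], []

@engine.on(Events.ITERATION_STARTED(every=10))
def on_every_10th(engine):
    every_iters.append(engine.state.iteration)

@engine.on(Events.ITERATION_STARTED(once=10))
def on_10th_only(engine):
    once_iters.append(engine.state.iteration)

engine.run(list(range(25)), max_epochs=2)  # 50 iterations in total
assert every_iters == [10, 20, 30, 40, 50]
assert once_iters == [10]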
def test_as_context_manager():
    n_epochs = 5
    data = list(range(50))

    class _DummyLogger(BaseLogger):
        def __init__(self, writer):
            self.writer = writer

        def close(self):
            self.writer.close()

    def _test(event, n_calls):
        global close_counter
        close_counter = 0

        losses = torch.rand(n_epochs * len(data))
        losses_iter = iter(losses)

        def update_fn(engine, batch):
            return next(losses_iter)

        writer = MagicMock()
        writer.close = MagicMock()

        with _DummyLogger(writer) as logger:
            assert isinstance(logger, _DummyLogger)

            trainer = Engine(update_fn)
            mock_log_handler = MagicMock()

            logger.attach(trainer, log_handler=mock_log_handler, event_name=event)

            trainer.run(data, max_epochs=n_epochs)

            if isinstance(event, EventWithFilter):
                event = event.event

            mock_log_handler.assert_called_with(trainer, logger, event)
            assert mock_log_handler.call_count == n_calls

        writer.close.assert_called_once_with()

    _test(Events.ITERATION_STARTED, len(data) * n_epochs)
    _test(Events.ITERATION_COMPLETED, len(data) * n_epochs)
    _test(Events.EPOCH_STARTED, n_epochs)
    _test(Events.EPOCH_COMPLETED, n_epochs)
    _test(Events.STARTED, 1)
    _test(Events.COMPLETED, 1)
    _test(Events.ITERATION_STARTED(every=10), len(data) // 10 * n_epochs)
def test_list_of_events():
    def _test(event_list, true_iterations):
        engine = Engine(lambda e, b: b)

        iterations = []
        num_calls = [0]

        @engine.on(event_list)
        def execute_some_handler(e):
            iterations.append(e.state.iteration)
            num_calls[0] += 1

        engine.run(range(3), max_epochs=5)

        assert iterations == true_iterations
        assert num_calls[0] == len(true_iterations)

    _test(Events.ITERATION_STARTED(once=1) | Events.ITERATION_STARTED(once=1), [1, 1])
    _test(Events.ITERATION_STARTED(once=1) | Events.ITERATION_STARTED(once=10), [1, 10])
    _test(Events.ITERATION_STARTED(once=1) | Events.ITERATION_STARTED(every=3), [1, 3, 6, 9, 12, 15])
def setup_any_logging(logger, logger_module, trainer, optimizers, evaluators, log_every_iters):
    if optimizers is not None:
        from torch.optim.optimizer import Optimizer

        if not isinstance(optimizers, (Optimizer, Mapping)):
            raise TypeError("Argument optimizers should be either a single optimizer or a dictionary of optimizers")

    if evaluators is not None:
        if not isinstance(evaluators, (Engine, Mapping)):
            raise TypeError("Argument evaluators should be either a single engine or a dictionary of engines")

    if log_every_iters is None:
        log_every_iters = 1

    logger.attach(
        trainer,
        log_handler=logger_module.OutputHandler(tag="training", metric_names="all"),
        event_name=Events.ITERATION_COMPLETED(every=log_every_iters),
    )

    if optimizers is not None:
        # Log optimizer parameters
        if isinstance(optimizers, Optimizer):
            optimizers = {None: optimizers}

        for k, optimizer in optimizers.items():
            logger.attach(
                trainer,
                log_handler=logger_module.OptimizerParamsHandler(optimizer, param_name="lr", tag=k),
                event_name=Events.ITERATION_STARTED(every=log_every_iters),
            )

    if evaluators is not None:
        # Log evaluation metrics
        if isinstance(evaluators, Engine):
            evaluators = {"validation": evaluators}

        for k, evaluator in evaluators.items():
            gst = global_step_from_engine(trainer)
            logger.attach(
                evaluator,
                log_handler=logger_module.OutputHandler(tag=k, metric_names="all", global_step_transform=gst),
                event_name=Events.COMPLETED,
            )
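# Hedged usage sketch for setup_any_logging above. The TensorBoard module is one
# logger_module exposing the OutputHandler/OptimizerParamsHandler names the
# helper expects; the model, trainer, evaluator and log directory here are
# illustrative stand-ins.
import torch
from ignite.engine import Engine
from ignite.contrib.handlers import tensorboard_logger

model = torch.nn.Linear(2, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
trainer = Engine(lambda e, b: b)
evaluator = Engine(lambda e, b: b)

tb_logger = tensorboard_logger.TensorboardLogger(log_dir="/tmp/tb_logs")
setup_any_logging(
    tb_logger,
    tensorboard_logger,
    trainer,
    optimizers=optimizer,  # or a dict, e.g. {"generator": opt_g, "discriminator": opt_d}
    evaluators=evaluator,  # or a dict, e.g. {"validation": evaluator}
    log_every_iters=100,
)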
def main(args):
    log_dir_path = Path(args.o)
    log_dir_path.mkdir(parents=True, exist_ok=True)

    if torch.cuda.is_available():
        args.device = torch.device("cuda")
        print("GPU mode")
    else:
        args.device = torch.device("cpu")
        print("CPU mode")

    net = PolyNet(in_channels=(1 if args.d == "mnist" else 3)).to(args.device)

    kwds = {"root": ".", "download": True, "transform": transforms.ToTensor()}
    dataset_class = {"mnist": MNIST, "cifar10": CIFAR10}[args.d]
    train_dataset = dataset_class(train=True, **kwds)
    test_dataset = dataset_class(train=False, **kwds)
    train_loader = data.DataLoader(train_dataset, batch_size=args.b, shuffle=True)
    test_loader = data.DataLoader(test_dataset, batch_size=args.b)

    opt = torch.optim.Adam(net.parameters(), lr=args.lr, betas=args.betas, weight_decay=args.weight_decay)

    trainer = create_supervised_trainer(net, opt, F.cross_entropy, device=args.device)

    metrics = {"accuracy": Accuracy(), "loss": Loss(F.cross_entropy)}
    evaluator = create_supervised_evaluator(net, metrics=metrics, device=args.device)

    trainer.add_event_handler(Events.EPOCH_COMPLETED, evaluate(evaluator, train_loader, test_loader, log_dir_path))

    if args.cg:
        trainer.add_event_handler(
            Events.ITERATION_STARTED(once=1),
            computational_graph(net, train_dataset, log_dir_path, device=args.device),
        )

    trainer.run(train_loader, max_epochs=args.e)
def attach(self, engine, name, every=300):
    # add funcs to accumulate metrics over iterations
    engine.add_event_handler(Events.STARTED, self.started)
    engine.add_event_handler(Events.ITERATION_COMPLETED(every=every - 1), self.completed, name)
    if not engine.has_event_handler(self.started, Events.ITERATION_STARTED):
        engine.add_event_handler(Events.ITERATION_STARTED(every=every), self.started)
    if not engine.has_event_handler(self.iteration_completed, Events.ITERATION_COMPLETED):
        engine.add_event_handler(Events.ITERATION_COMPLETED, self.iteration_completed)
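# A quick check of the has_event_handler guard used in attach() above -- a
# sketch assuming ignite 0.4+, where a handler attached through a filtered
# event is still matched against the original function (the wrapper keeps a
# weakref to it precisely so that has_event_handler works):
from ignite.engine import Engine, Events

engine = Engine(lambda e, b: b)

def started(engine):
    pass

engine.add_event_handler(Events.ITERATION_STARTED(every=300), started)
# The handler is registered under the base event, so the guard sees it:
assert engine.has_event_handler(started, Events.ITERATION_STARTED)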
def test_attach():
    n_epochs = 5
    data = list(range(50))

    def _test(event, n_calls, kwargs={}):
        losses = torch.rand(n_epochs * len(data))
        losses_iter = iter(losses)

        def update_fn(engine, batch):
            return next(losses_iter)

        trainer = Engine(update_fn)

        logger = DummyLogger()
        mock_log_handler = MagicMock()

        logger.attach(trainer, log_handler=mock_log_handler, event_name=event, **kwargs)

        trainer.run(data, max_epochs=n_epochs)

        if isinstance(event, EventsList):
            events = [e for e in event]
        else:
            events = [event]

        if len(kwargs) > 0:
            calls = [call(trainer, logger, e, **kwargs) for e in events]
        else:
            calls = [call(trainer, logger, e) for e in events]

        mock_log_handler.assert_has_calls(calls)
        assert mock_log_handler.call_count == n_calls

    _test(Events.ITERATION_STARTED, len(data) * n_epochs, kwargs={"a": 0})
    _test(Events.ITERATION_COMPLETED, len(data) * n_epochs)
    _test(Events.EPOCH_STARTED, n_epochs)
    _test(Events.EPOCH_COMPLETED, n_epochs)
    _test(Events.STARTED, 1)
    _test(Events.COMPLETED, 1)
    _test(Events.ITERATION_STARTED(every=10), len(data) // 10 * n_epochs)
    _test(Events.STARTED | Events.COMPLETED, 2)
def test_pbar_on_callable_events(capsys):
    n_epochs = 1
    loader = list(range(100))
    engine = Engine(update_fn)

    pbar = ProgressBar()
    pbar.attach(engine, event_name=Events.ITERATION_STARTED(every=10), closing_event_name=Events.EPOCH_COMPLETED)
    engine.run(loader, max_epochs=n_epochs)

    captured = capsys.readouterr()
    err = captured.err.split("\r")
    err = list(map(lambda x: x.strip(), err))
    err = list(filter(None, err))
    actual = err[-1]
    expected = "Iteration: [90/100] 90%|█████████ [00:00<00:00]"
    assert actual == expected
def training(local_rank, config):
    config["device"] = "cuda" if config["active_gpu_ids"] else "cpu"

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="Carbon Black Semantic Segmentation Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = utils.get_time_stamp()
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = (
            f"{config['architecture']}-{config['encoder']}-{config['encoder_weights']}_"
            f"backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        )
        output_path = Path(output_path) / folder_name
        output_path.mkdir(parents=True, exist_ok=True)
        config["output_path"] = output_path.as_posix()
        config["task_name"] = output_path.stem

        logger.info(f"Output path: {output_path}")

        if "cuda" in idist.device().type:
            config["cuda_device_name"] = torch.cuda.get_device_name(local_rank)

        setup_trains_logging(config)

    dataloader_train, dataloader_val = get_dataloaders(config)

    config["num_iterations_per_epoch"] = len(dataloader_train)
    config["num_epochs"] = round(config["num_iterations"] / config["num_iterations_per_epoch"])
    model = modeling.get_model(config)

    optimizer = get_optimizer(model, config)
    loss = get_loss()

    lr_scheduler = get_lr_scheduler(optimizer, config)

    trainer = create_trainer(model, optimizer, loss, lr_scheduler, dataloader_train.sampler, config, logger)

    metrics = get_metrics(loss)

    # We define two evaluators, as they don't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)
    evaluator_train = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        # - Training metrics, e.g. running average loss values
        # - Learning rate
        # - Evaluation train/test metrics
        evaluators = {"training": evaluator_train, "validation": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)

        example_prediction_logger = ExamplePredictionLogger(tb_logger, model, device)

    def run_validation(engine):
        epoch = trainer.state.epoch

        state = evaluator_train.run(dataloader_train)
        data_subset = "Train"
        log_metrics(logger, epoch, state.times["COMPLETED"], data_subset, state.metrics)
        log_confusion_matrix(tb_logger, epoch, data_subset, state.metrics)

        state = evaluator.run(dataloader_val)
        data_subset = "Val"
        log_metrics(logger, epoch, state.times["COMPLETED"], data_subset, state.metrics)
        log_confusion_matrix(tb_logger, epoch, data_subset, state.metrics)

        example_prediction_logger.log_visualization(dataloader_val.dataset, epoch)

    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="validation",
    )

    # TODO: Add early stopping

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    # noinspection PyBroadException
    try:
        trainer.run(dataloader_train, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        # noinspection PyUnboundLocalVariable
        tb_logger.close()
def run(output_path, config):
    device = "cuda"
    local_rank = config["local_rank"]
    distributed = backend is not None
    if distributed:
        torch.cuda.set_device(local_rank)
        device = "cuda"
    rank = dist.get_rank() if distributed else 0

    torch.manual_seed(config["seed"] + rank)

    # Rescale batch_size and num_workers
    ngpus_per_node = torch.cuda.device_count()
    ngpus = dist.get_world_size() if distributed else 1
    batch_size = config["batch_size"] // ngpus
    num_workers = int((config["num_workers"] + ngpus_per_node - 1) / ngpus_per_node)

    train_loader, test_loader = get_train_test_loaders(
        path=config["data_path"],
        batch_size=batch_size,
        distributed=distributed,
        num_workers=num_workers,
    )

    model = get_model(config["model"])
    model = model.to(device)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
        )

    optimizer = optim.SGD(
        model.parameters(),
        lr=config["learning_rate"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        nesterov=True,
    )

    criterion = nn.CrossEntropyLoss().to(device)

    le = len(train_loader)
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)

    def _prepare_batch(batch, device, non_blocking):
        x, y = batch
        return (
            convert_tensor(x, device=device, non_blocking=non_blocking),
            convert_tensor(y, device=device, non_blocking=non_blocking),
        )

    def process_function(engine, batch):
        x, y = _prepare_batch(batch, device=device, non_blocking=True)

        model.train()
        # Supervised part
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return {
            "batch loss": loss.item(),
        }

    trainer = Engine(process_function)
    train_sampler = train_loader.sampler if distributed else None
    to_save = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": lr_scheduler,
    }
    metric_names = [
        "batch loss",
    ]
    common.setup_common_training_handlers(
        trainer,
        train_sampler=train_sampler,
        to_save=to_save,
        save_every_iters=config["checkpoint_every"],
        output_path=output_path,
        lr_scheduler=lr_scheduler,
        output_names=metric_names,
        with_pbar_on_iters=config["display_iters"],
        log_every_iters=10,
    )

    if rank == 0:
        tb_logger = TensorboardLogger(log_dir=output_path)
        tb_logger.attach(
            trainer,
            log_handler=OutputHandler(tag="train", metric_names=metric_names),
            event_name=Events.ITERATION_COMPLETED,
        )
        tb_logger.attach(
            trainer,
            log_handler=OptimizerParamsHandler(optimizer, param_name="lr"),
            event_name=Events.ITERATION_STARTED,
        )

    metrics = {
        "accuracy": Accuracy(device=device if distributed else None),
        "loss": Loss(criterion, device=device if distributed else None),
    }

    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def run_validation(engine):
        torch.cuda.synchronize()
        train_evaluator.run(train_loader)
        evaluator.run(test_loader)

    trainer.add_event_handler(Events.EPOCH_STARTED(every=config["validate_every"]), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    if rank == 0:
        if config["display_iters"]:
            ProgressBar(persist=False, desc="Train evaluation").attach(train_evaluator)
            ProgressBar(persist=False, desc="Test evaluation").attach(evaluator)

        tb_logger.attach(
            train_evaluator,
            log_handler=OutputHandler(
                tag="train",
                metric_names=list(metrics.keys()),
                global_step_transform=global_step_from_engine(trainer),
            ),
            event_name=Events.COMPLETED,
        )
        tb_logger.attach(
            evaluator,
            log_handler=OutputHandler(
                tag="test",
                metric_names=list(metrics.keys()),
                global_step_transform=global_step_from_engine(trainer),
            ),
            event_name=Events.COMPLETED,
        )

        # Store the best model by validation accuracy:
        common.save_best_model_by_val_score(
            output_path,
            evaluator,
            model=model,
            metric_name="accuracy",
            n_saved=3,
            trainer=trainer,
            tag="test",
        )

        if config["log_model_grads_every"] is not None:
            tb_logger.attach(
                trainer,
                log_handler=GradsHistHandler(model, tag=model.__class__.__name__),
                event_name=Events.ITERATION_COMPLETED(every=config["log_model_grads_every"]),
            )

    if config["crash_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["crash_iteration"]))
        def _(engine):
            raise Exception("STOP at iteration: {}".format(engine.state.iteration))

    resume_from = config["resume_from"]
    if resume_from is not None:
        checkpoint_fp = Path(resume_from)
        assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(checkpoint_fp.as_posix())
        print("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix()))
        checkpoint = torch.load(checkpoint_fp.as_posix())
        Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
trainer = create_supervised_trainer(model, optimizer, F.cross_entropy, DEVICE, prepare_batch=prepare_batch)
evaluator = create_supervised_evaluator(
    model, {"loss": ignite.metrics.Loss(F.cross_entropy)}, DEVICE, prepare_batch=prepare_batch
)

def foo(_):
    print(f"Before iteration {trainer.state.iteration} (counting from 1):")
    evaluator.run(train_dataloader, epoch_length=math.ceil(10000 / BATCH_SIZE))
    print(f"\tEstimate of train loss = {evaluator.state.metrics['loss']}")
    evaluator.run(val_dataloader, epoch_length=math.ceil(10000 / BATCH_SIZE))
    print(f"\tEstimate of val loss = {evaluator.state.metrics['loss']}")

# `once` expects a positive iteration number, so run `foo` once at iteration 1
# and then every 15 iterations
trainer.on(Events.ITERATION_STARTED(once=1))(foo)
trainer.on(Events.ITERATION_STARTED(every=15))(foo)

@trainer.on(Events.EPOCH_COMPLETED)
def print_first_index(_):
    print(f"{trainer.state.batch[2][0]=}")

trainer.run(train_dataloader, max_epochs=3)
def test_callable_events_every_eq_one():
    e = Events.ITERATION_STARTED(every=1)
    assert not isinstance(e, EventWithFilter)
    assert isinstance(e, Events)
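# Minimal demonstration of the shortcut asserted above: since every=1 degrades
# to the plain event, a handler attached through it fires on every iteration.
from ignite.engine import Engine, Events

engine = Engine(lambda e, b: b)
calls = []

engine.add_event_handler(Events.ITERATION_STARTED(every=1), lambda e: calls.append(e.state.iteration))
engine.run(list(range(5)), max_epochs=1)
assert calls == [1, 2, 3, 4, 5]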
def training(local_rank, config):
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyperparameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators, as they don't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_evaluator(model, metrics=metrics, config=config)
    train_evaluator = create_evaluator(model, metrics=metrics, config=config)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        # - Training metrics, e.g. running average loss values
        # - Learning rate
        # - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)

    # Store 2 best models by validation accuracy starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        get_save_handler(config),
        filename_prefix="best",
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    evaluator.add_event_handler(
        Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler
    )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        logger.exception("")
        raise e

    if rank == 0:
        tb_logger.close()
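# Isolated sketch of the filtered-COMPLETED pattern used above for the
# checkpoint handler: the event filter receives (engine, event) and may consult
# another engine's state, so the handler only runs once the trainer is past the
# halfway point. Dummy engines stand in for the real trainer/evaluator.
from ignite.engine import Engine, Events

num_epochs = 10
trainer = Engine(lambda e, b: b)
evaluator = Engine(lambda e, b: b)
checkpointed_at = []

@trainer.on(Events.EPOCH_COMPLETED)
def run_validation(_):
    evaluator.run([0])

@evaluator.on(Events.COMPLETED(lambda *_: trainer.state.epoch > num_epochs // 2))
def save_best_model(_):
    checkpointed_at.append(trainer.state.epoch)

trainer.run([0], max_epochs=num_epochs)
assert checkpointed_at == [6, 7, 8, 9, 10]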
def configure_wandb_logging(trainer, evaluator, test_evaluator, model, criterion, optimizer, args):
    if args.dev_mode:
        os.environ["WANDB_MODE"] = "dryrun"

    wandb_logger = WandBLogger(dir=str(args.output_dir))
    wandb_logger.watch(model, criterion, log="all", log_freq=args.log_interval)

    # Log training-specific metrics.
    wandb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=args.log_interval),
        tag="training",
        output_transform=lambda output: {"batchloss": output["loss"]},
        global_step_transform=lambda *_: trainer.state.iteration,
    )

    # Configure basic metric logging.
    for tag, engine in [("training", trainer), ("validation", evaluator), ("test", test_evaluator)]:
        wandb_logger.attach_output_handler(
            engine,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names="all",
            global_step_transform=lambda *_: trainer.state.iteration,
        )

    # Track the epoch associated with the current training iteration.
    @trainer.on(Events.ITERATION_STARTED(every=args.log_interval))
    def log_epoch(engine: Engine):
        wandb_logger.log({"epoch": engine.state.epoch}, step=engine.state.iteration, commit=False)

    @trainer.on(CustomEvents.EXAMPLE_PREDICTIONS_READY)
    def log_example_predictions(engine: Engine):
        for tag, (x, y, y_pred) in engine.state.examples.items():
            x, y, y_pred = x.numpy(), y.numpy(), y_pred.numpy()

            # Convert log scale (torch.log_softmax) predictions.
            y_pred = np.exp(y_pred)

            # Prepare images for plotting.
            moments = engine.state.dataloader.dataset.moments
            x = x.transpose(0, 2, 3, 1)  # NCHW -> NHWC
            x = x * moments["std"] + moments["mean"]  # Denormalize using dataset moments
            x = x.clip(0, 1)

            # Plot grid of predictions for "example" batch.
            idx_to_class = {v: k for k, v in engine.state.dataloader.dataset.class_to_idx.items()}
            image = prediction_grid(x, y, y_pred, idx_to_class)

            # Save the prediction grid both to file system and W&B.
            wandb_logger.log({f"{tag}/examples": wandb_logger.Image(image)}, step=engine.state.iteration)
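# CustomEvents.EXAMPLE_PREDICTIONS_READY above is not an ignite built-in. A
# sketch of how such an event could be declared and wired up with ignite's
# custom-event API (EventEnum / register_events / fire_event, ignite 0.4+);
# the event value and the firing site are assumptions:
from ignite.engine import Engine, EventEnum

class CustomEvents(EventEnum):
    EXAMPLE_PREDICTIONS_READY = "example_predictions_ready"

trainer = Engine(lambda e, b: b)
trainer.register_events(*CustomEvents)

@trainer.on(CustomEvents.EXAMPLE_PREDICTIONS_READY)
def on_examples_ready(engine):
    print("examples ready at iteration", engine.state.iteration)

# Somewhere in an iteration handler, after engine.state.examples is populated:
# trainer.fire_event(CustomEvents.EXAMPLE_PREDICTIONS_READY)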
ProgressBar(persist=False, desc='Test evaluation').attach(valid_evaluator)

log_handler = OutputHandler(tag='train', metric_names=list(metrics.keys()),
                            global_step_transform=global_step_from_engine(trainer))
tb_logger.attach(train_evaluator, log_handler=log_handler, event_name=Events.COMPLETED)

log_handler = OutputHandler(tag='test', metric_names=list(metrics.keys()),
                            global_step_transform=global_step_from_engine(trainer))
tb_logger.attach(valid_evaluator, log_handler=log_handler, event_name=Events.COMPLETED)

# Store the best model by validation accuracy:
common.save_best_model_by_val_score(str(output_path), valid_evaluator, model=model,
                                    metric_name='accuracy', n_saved=3, trainer=trainer, tag='val')

if hp['log_grads_every_iters'] is not None and hp['log_grads_every_iters'] > 0:
    tb_logger.attach(trainer, log_handler=GradsHistHandler(model, tag=model.__class__.__name__),
                     event_name=Events.ITERATION_COMPLETED(every=hp['log_grads_every_iters']))

if hp['crash_iteration'] is not None and hp['crash_iteration'] >= 0:
    @trainer.on(Events.ITERATION_STARTED(once=hp['crash_iteration']))
    def _(engine):
        raise Exception('STOP at iteration: {}'.format(engine.state.iteration))

if nni_compression_pruner is not None:
    # Notify the NNI compressor (pruner or quantizer) on each epoch and, if the provided
    # Pruner/Quantizer needs it, on each step/batch iteration (see the NNI Compression
    # documentation for details:
    # https://nni.readthedocs.io/en/latest/Compressor/QuickStart.html#apis-for-updating-fine-tuning-status)
    @trainer.on(Events.EPOCH_STARTED)
    def _nni_compression_update_epoch(engine):
        nni_compression_pruner.update_epoch(engine.state.epoch)

    # Note: the original check `getattr(...) is Callable` compared against the
    # typing.Callable object itself and was always False; callable() is intended.
    if callable(getattr(nni_compression_pruner, 'step', None)):
        @trainer.on(Events.ITERATION_COMPLETED)
        def _nni_compression_batch_step(engine):
            nni_compression_pruner.step()

_resume_training(hp.get('resume_from'), to_save)
def training(local_rank, config):
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="ImageNet-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = "stop-on-{}".format(config["stop_iteration"])

        folder_name = "{}_backend-{}-{}_{}".format(config["model"], idist.backend(), idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_imagenet_dataloader(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_supervised_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }

    # We define two evaluators, as they don't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        # - Training metrics, e.g. running average loss values
        # - Learning rate
        # - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="test",
    )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info("Stop training on {} iteration".format(trainer.state.iteration))
            trainer.terminate()

    @trainer.on(Events.ITERATION_COMPLETED(every=20))
    def print_acc(engine):
        if rank == 0:
            print(
                "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}".format(
                    engine.state.epoch,
                    engine.state.iteration,
                    len(train_loader),
                    engine.state.saved_batch_loss,
                )
            )

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
def add_handlers(
    trainer,
    validator,
    train_loader,
    validation_loader,
    model,
    optimizer,
    config,
):
    training_saver = ModelCheckpoint(
        dirname=os.path.join(config.experiment_name, 'checkpoint'),
        filename_prefix='ckpt',
        n_saved=5,
        require_empty=False,
    )
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED(every=config.ckpt_interval),
        training_saver,
        {'model': model, 'optimizer': optimizer},
    )

    train_images_dir = os.path.join(config.experiment_name, 'images', 'train')
    os.makedirs(train_images_dir, exist_ok=True)

    def get_frame(engine):
        return Image.fromarray(np.rint(engine.state.frame.transpose(1, 2, 0) * 255).astype(np.uint8))

    trainer.add_event_handler(
        Events.ITERATION_COMPLETED(every=config.dump_interval),
        handlers.dump_image,
        folder=train_images_dir,
        get_image_fn=get_frame,
        suffix='frame',
    )

    def get_saliency(engine):
        salmap = engine.state.saliency[0]
        salmap = transforms._scale_values(salmap)
        return Image.fromarray(np.rint(salmap * 255).astype(np.uint8))

    trainer.add_event_handler(
        Events.ITERATION_COMPLETED(every=config.dump_interval),
        handlers.dump_image,
        folder=train_images_dir,
        get_image_fn=get_saliency,
        suffix='saliency',
    )

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config.vld_every_epoch),
        handlers.copy_dir,
        src=config.experiment_name,
        dst=os.path.join(config.experiments_dir, config.experiment_name),
    )

    # Note: the original registered reset_accumulators on EPOCH_STARTED twice;
    # one duplicate registration was dropped.
    trainer.add_event_handler(
        Events.EPOCH_STARTED,
        handlers.reset_accumulators,
    )
    trainer.add_event_handler(
        Events.EPOCH_STARTED,
        handlers.log_epoch,
    )
    trainer.add_event_handler(
        Events.ITERATION_STARTED(every=config.log_interval),
        handlers.log_iteration,
    )
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED(every=config.log_interval),
        handlers.log_metrics,
    )
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED,
        handlers.update_accumulators,
    )
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED(every=config.log_interval),
        handlers.log_learning_rate,
        optimizer=optimizer,
    )
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config.vld_every_epoch),
        handlers.run_on_validation,
        validation_engine=validator,
        validation_loader=validation_loader,
    )

    def get_nss(engine):
        return engine.state.mean_losses['NSS']

    handler = EarlyStopping(
        patience=10,
        score_function=get_nss,
        trainer=trainer,
    )
    validator.add_event_handler(Events.COMPLETED, handler)

    best_model_saver = ModelCheckpoint(
        dirname=os.path.join(config.experiment_name, 'best_models'),
        filename_prefix='best',
        score_name='nss',
        score_function=get_nss,
        n_saved=config.best_model_count,
        require_empty=False,
    )
    validator.add_event_handler(
        Events.COMPLETED,
        best_model_saver,
        {'model': model},
    )

    validation_images_dir = os.path.join(config.experiment_name, 'images', 'validation')
    os.makedirs(validation_images_dir, exist_ok=True)

    validator.add_event_handler(
        Events.ITERATION_COMPLETED(every=config.dump_interval),
        handlers.dump_image,
        folder=validation_images_dir,
        get_image_fn=get_frame,
        suffix='frame',
    )
    validator.add_event_handler(
        Events.ITERATION_COMPLETED(every=config.dump_interval),
        handlers.dump_image,
        folder=validation_images_dir,
        get_image_fn=get_saliency,
        suffix='saliency',
    )
    validator.add_event_handler(
        Events.EPOCH_STARTED,
        handlers.reset_accumulators,
    )
    validator.add_event_handler(
        Events.EPOCH_COMPLETED,
        handlers.log_metrics,
        reset_after_log=False,
    )
    validator.add_event_handler(
        Events.ITERATION_COMPLETED,
        handlers.update_accumulators,
    )
def run(output_path, config):
    distributed = dist.is_available() and dist.is_initialized()
    rank = dist.get_rank() if distributed else 0

    manual_seed(config["seed"] + rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = utils.get_dataflow(config, distributed)
    model, optimizer = utils.get_model_optimizer(config, distributed)
    criterion = nn.CrossEntropyLoss().to(utils.device)

    le = len(train_loader)
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)

    # Setup Ignite trainer:
    # - let's define the training step
    # - add other common handlers:
    #    - TerminateOnNan
    #    - handler to setup learning rate scheduling
    #    - ModelCheckpoint
    #    - `RunningAverage` on `train_step` output
    #    - two progress bars: on epochs and, optionally, on iterations

    def train_step(engine, batch):
        x = convert_tensor(batch[0], device=utils.device, non_blocking=True)
        y = convert_tensor(batch[1], device=utils.device, non_blocking=True)

        model.train()
        # Supervised part
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return {
            "batch loss": loss.item(),
        }

    if config["deterministic"] and rank == 0:
        print("Setup deterministic trainer")
    trainer = Engine(train_step) if not config["deterministic"] else DeterministicEngine(train_step)

    train_sampler = train_loader.sampler if distributed else None
    to_save = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler}
    metric_names = [
        "batch loss",
    ]
    common.setup_common_training_handlers(
        trainer,
        train_sampler=train_sampler,
        to_save=to_save,
        save_every_iters=config["checkpoint_every"],
        output_path=output_path,
        lr_scheduler=lr_scheduler,
        output_names=metric_names,
        with_pbar_on_iters=config["display_iters"],
        log_every_iters=10,
    )

    if rank == 0:
        # Setup TensorBoard logger - a wrapper on SummaryWriter
        tb_logger = TensorboardLogger(log_dir=output_path)
        # Attach the logger to the trainer and log the trainer's metrics (stored in
        # trainer.state.metrics) every iteration
        tb_logger.attach(
            trainer,
            log_handler=OutputHandler(tag="train", metric_names=metric_names),
            event_name=Events.ITERATION_COMPLETED,
        )
        # Log optimizer parameters: "lr" every iteration
        tb_logger.attach(
            trainer,
            log_handler=OptimizerParamsHandler(optimizer, param_name="lr"),
            event_name=Events.ITERATION_STARTED,
        )

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(device=utils.device if distributed else None),
        "loss": Loss(criterion, device=utils.device if distributed else None),
    }

    # We define two evaluators, as they don't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=utils.device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=utils.device, non_blocking=True)

    def run_validation(engine):
        train_evaluator.run(train_loader)
        evaluator.run(test_loader)

    trainer.add_event_handler(Events.EPOCH_STARTED(every=config["validate_every"]), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup progress bars on the evaluation engines
        if config["display_iters"]:
            ProgressBar(persist=False, desc="Train evaluation").attach(train_evaluator)
            ProgressBar(persist=False, desc="Test evaluation").attach(evaluator)

        # Let's log metrics of `train_evaluator` stored in `train_evaluator.state.metrics`
        # when a validation run is done
        tb_logger.attach(
            train_evaluator,
            log_handler=OutputHandler(
                tag="train", metric_names="all", global_step_transform=global_step_from_engine(trainer)
            ),
            event_name=Events.COMPLETED,
        )

        # Let's log metrics of `evaluator` stored in `evaluator.state.metrics`
        # when a validation run is done
        tb_logger.attach(
            evaluator,
            log_handler=OutputHandler(
                tag="test", metric_names="all", global_step_transform=global_step_from_engine(trainer)
            ),
            event_name=Events.COMPLETED,
        )

        # Store 3 best models by validation accuracy:
        common.save_best_model_by_val_score(
            output_path, evaluator, model=model, metric_name="accuracy", n_saved=3, trainer=trainer, tag="test"
        )

        # Optionally log model gradients
        if config["log_model_grads_every"] is not None:
            tb_logger.attach(
                trainer,
                log_handler=GradsHistHandler(model, tag=model.__class__.__name__),
                event_name=Events.ITERATION_COMPLETED(every=config["log_model_grads_every"]),
            )

    # In order to check training resuming we can emulate a crash
    if config["crash_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["crash_iteration"]))
        def _(engine):
            raise Exception("STOP at iteration: {}".format(engine.state.iteration))

    resume_from = config["resume_from"]
    if resume_from is not None:
        checkpoint_fp = Path(resume_from)
        assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(checkpoint_fp.as_posix())
        print("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix()))
        checkpoint = torch.load(checkpoint_fp.as_posix())
        Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()