def _setup_seed(self, _=None, iter_counter=None, iteration=None):
    if iter_counter is None:
        le = self._dataloader_len if self._dataloader_len is not None else 1
    else:
        le = iter_counter
    if iteration is None:
        iteration = self.state.iteration
    manual_seed(self.state.seed + iteration // le)
def user_handler():
    manual_seed(22)
    _ = [
        random.random(),
        torch.rand(2),
    ]
    if with_numpy:
        _ = np.random.rand(2)
def _setup_seed(self, _: Any = None, iter_counter: Optional[int] = None, iteration: Optional[int] = None) -> None:
    if iter_counter is None:
        le = self._dataloader_len if self._dataloader_len is not None else 1
    else:
        le = iter_counter
    if iteration is None:
        iteration = self.state.iteration
    manual_seed(self.state.seed + iteration // le)  # type: ignore[operator]
def _init_distribution(self):
    self.rank = idist.get_rank()
    manual_seed(42 + self.rank)
    self.device = idist.device()

    if self.train_ds:
        if self.train_ds.sampler is not None:
            sampler = self.train_ds.sampler(self.train_ds, self.train_ds.get_label)
            isShuffle = False
        else:
            sampler = None
            isShuffle = True

        self.train_loader = idist.auto_dataloader(
            self.train_ds,
            batch_size=self.hparams.train_bs,
            num_workers=self.hparams.train_num_workers,
            shuffle=isShuffle,
            drop_last=True,
            sampler=sampler,
            **self.train_ds.additional_loader_params,
        )

    if self.valid_ds:
        self.valid_loader = idist.auto_dataloader(
            self.valid_ds,
            batch_size=self.hparams.valid_bs,
            num_workers=self.hparams.valid_num_workers,
            shuffle=False,
            drop_last=False,
            **self.valid_ds.additional_loader_params,
        )

    if self.test_ds:
        self.test_loader = idist.auto_dataloader(
            self.test_ds,
            batch_size=self.hparams.valid_bs,
            num_workers=self.hparams.valid_num_workers,
            shuffle=False,
            drop_last=False,
            **self.test_ds.additional_loader_params,
        )

    if USE_AMP:
        self._init_optimizer()
        self.model = idist.auto_model(self.model)
        self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level="O1")
    else:
        self.model = idist.auto_model(self.model)

    if not USE_AMP:
        self._init_optimizer()
        self.optimizer = idist.auto_optim(self.optimizer)

    self._init_scheduler()
    self.criterion = self.criterion.to(self.device)
def evaluation(local_rank, config, logger, with_clearml):
    rank = idist.get_rank()
    device = idist.device()
    manual_seed(config.seed + local_rank)

    data_loader = config.data_loader
    model = config.model.to(device)

    # Load weights:
    state_dict = get_model_weights(config, logger, with_clearml)
    model.load_state_dict(state_dict)

    # Adapt model to dist config
    model = idist.auto_model(model)

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if ("val_metrics" in config) and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="val")

    # Setup Tensorboard logger
    if rank == 0:
        tb_logger = common.TensorboardLogger(log_dir=config.output_path.as_posix())
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.COMPLETED,
            tag="validation",
            metric_names="all",
        )

    # Log confusion matrix to ClearML:
    if with_clearml:
        evaluator.add_event_handler(Events.COMPLETED, compute_and_log_cm, cm_metric, evaluator.state.iteration)

    state = evaluator.run(data_loader)
    utils.log_metrics(logger, 0, state.times["COMPLETED"], "Validation", state.metrics)

    if idist.get_rank() == 0:
        tb_logger.close()
def _setup_seed(self, _: Any = None, iter_counter: Optional[int] = None, iteration: Optional[int] = None) -> None:
    if iter_counter is None:
        le = self._dataloader_len if self._dataloader_len is not None else 1
    elif not iter_counter > 0:
        raise ValueError("iter_counter should be positive value")
    else:
        le = iter_counter
    if iteration is None:
        iteration = self.state.iteration
    manual_seed(self.state.seed + iteration // le)  # type: ignore[operator]
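# --- Added illustration (not from the snippets above) ---
# A minimal, self-contained sketch of the reseeding scheme used by the _setup_seed
# variants above: manual_seed(seed + iteration // epoch_length) gives every epoch its
# own deterministic seed, so reruns reproduce the same shuffling per epoch.
# `base_seed`, `epoch_length` and `epoch_permutation` are illustrative names, not Ignite API.
import torch
from ignite.utils import manual_seed

base_seed = 12
epoch_length = 5

def epoch_permutation(iteration):
    # Same seed (hence same permutation) within an epoch, a new one for the next epoch
    manual_seed(base_seed + iteration // epoch_length)
    return torch.randperm(20)

assert torch.equal(epoch_permutation(0), epoch_permutation(4))  # iterations 0..4 share a seed
print(torch.equal(epoch_permutation(0), epoch_permutation(5)))  # next epoch reseeds, typically False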
def test_psnr():
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # test for float
    manual_seed(42)
    y_pred = torch.rand(8, 3, 28, 28, device=device)
    y = y_pred * 0.8
    data_range = (y.max() - y.min()).cpu().item()
    _test_psnr(y_pred, y, data_range, device)

    # test for YCbCr
    manual_seed(42)
    y_pred = torch.randint(16, 236, (4, 1, 12, 12), dtype=torch.uint8, device=device)
    y = torch.randint(16, 236, (4, 1, 12, 12), dtype=torch.uint8, device=device)
    data_range = (y.max() - y.min()).cpu().item()
    _test_psnr(y_pred, y, data_range, device)

    # test for uint8
    manual_seed(42)
    y_pred = torch.randint(0, 256, (4, 3, 16, 16), dtype=torch.uint8, device=device)
    y = (y_pred * 0.8).to(torch.uint8)
    data_range = (y.max() - y.min()).cpu().item()
    _test_psnr(y_pred, y, data_range, device)

    # test with NHW shape
    manual_seed(42)
    y_pred = torch.rand(8, 28, 28, device=device)
    y = y_pred * 0.8
    data_range = (y.max() - y.min()).cpu().item()
    _test_psnr(y_pred, y, data_range, device)
def _test_keep_random_state(with_numpy):
    manual_seed(54)
    true_values = []
    for _ in range(5):
        t = [
            torch.tensor([random.random()]),
            torch.rand(2),
        ]
        if with_numpy:
            t.append(torch.from_numpy(np.random.rand(2)))
        true_values.append(t)

    @keep_random_state
    def user_handler():
        manual_seed(22)
        _ = [
            random.random(),
            torch.rand(2),
        ]
        if with_numpy:
            _ = np.random.rand(2)

    manual_seed(54)
    res_values = []
    for _ in range(5):
        r = [
            torch.tensor([random.random()]),
            torch.rand(2),
        ]
        if with_numpy:
            r.append(torch.from_numpy(np.random.rand(2)))
        res_values.append(r)
        user_handler()

    for a, b in zip(true_values, res_values):
        for i, j in zip(a, b):
            assert (i == j).all()
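# --- Added illustration (not from the test above) ---
# Minimal usage sketch of the decorator exercised by _test_keep_random_state, assuming
# keep_random_state is importable from ignite.engine.deterministic as in Ignite's tests:
# the wrapped handler may consume or reseed the global RNGs without affecting callers.
import torch
from ignite.engine.deterministic import keep_random_state
from ignite.utils import manual_seed

@keep_random_state
def noisy_handler():
    manual_seed(0)        # e.g. a visualization handler that reseeds internally
    _ = torch.rand(100)   # ...and consumes random numbers

manual_seed(123)
reference = torch.rand(3)

manual_seed(123)
noisy_handler()           # RNG states are captured before and restored after the call
resumed = torch.rand(3)

assert torch.equal(reference, resumed)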
def test_concepts_snippet_resume():
    # Commented imports required in the snippet
    # import torch
    # from torch.utils.data import DataLoader
    # from ignite.engine import DeterministicEngine
    # from ignite.utils import manual_seed

    seen_batches = []
    manual_seed(seed=15)

    def random_train_data_loader(size):
        data = torch.arange(0, size)
        return DataLoader(data, batch_size=4, shuffle=True)

    def print_train_data(engine, batch):
        i = engine.state.iteration
        e = engine.state.epoch
        print("train", e, i, batch.tolist())
        seen_batches.append(batch)

    trainer = DeterministicEngine(print_train_data)

    print("Original Run")
    manual_seed(56)
    trainer.run(random_train_data_loader(40), max_epochs=2, epoch_length=5)

    original_batches = list(seen_batches)
    seen_batches = []

    print("Resumed Run")
    trainer.load_state_dict({"epoch": 1, "epoch_length": 5, "max_epochs": 2, "rng_states": None})
    manual_seed(56)
    trainer.run(random_train_data_loader(40))

    resumed_batches = list(seen_batches)
    seen_batches = []

    for b1, b2 in zip(original_batches[5:], resumed_batches):
        assert (b1 == b2).all()
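# --- Added illustration (not from the test above) ---
# Side sketch of what the resume snippet relies on: with the global RNG seeded via
# manual_seed, a shuffled DataLoader draws the same sampling seed each time, so the
# resumed run replays epoch 2 (original_batches[5:]) exactly.
import torch
from torch.utils.data import DataLoader
from ignite.utils import manual_seed

def first_shuffled_batch():
    manual_seed(56)
    loader = DataLoader(torch.arange(0, 40), batch_size=4, shuffle=True)
    return next(iter(loader))

assert torch.equal(first_shuffled_batch(), first_shuffled_batch())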
def training(local_rank, config): rank = idist.get_rank() manual_seed(config["seed"] + rank) device = idist.device() logger = setup_logger(name="ImageNet-Training", distributed_rank=local_rank) log_basic_info(logger, config) output_path = config["output_path"] if rank == 0: if config["stop_iteration"] is None: now = datetime.now().strftime("%Y%m%d-%H%M%S") else: now = "stop-on-{}".format(config["stop_iteration"]) folder_name = "{}_backend-{}-{}_{}".format(config["model"], idist.backend(), idist.get_world_size(), now) output_path = Path(output_path) / folder_name if not output_path.exists(): output_path.mkdir(parents=True) config["output_path"] = output_path.as_posix() logger.info("Output path: {}".format(config["output_path"])) if "cuda" in device.type: config["cuda device name"] = torch.cuda.get_device_name(local_rank) # Setup dataflow, model, optimizer, criterion train_loader, test_loader = get_imagenet_dataloader(config) config["num_iters_per_epoch"] = len(train_loader) model, optimizer, criterion, lr_scheduler = initialize(config) # Create trainer for current task trainer = create_supervised_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger) # Let's now setup evaluator engine to perform model's validation and compute metrics metrics = { "accuracy": Accuracy(), "loss": Loss(criterion), } # We define two evaluators as they wont have exactly similar roles: # - `evaluator` will save the best model based on validation score evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) def run_validation(engine): epoch = trainer.state.epoch state = train_evaluator.run(train_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics) state = evaluator.run(test_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics) trainer.add_event_handler( Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation) if rank == 0: # Setup TensorBoard logging on trainer and evaluators. Logged values are: # - Training metrics, e.g. running average loss values # - Learning rate # - Evaluation train/test metrics evaluators = {"training": train_evaluator, "test": evaluator} tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators) # Store 3 best models by validation accuracy: common.gen_save_best_models_by_val_score( save_handler=get_save_handler(config), evaluator=evaluator, models={"model": model}, metric_name="accuracy", n_saved=3, trainer=trainer, tag="test", ) # In order to check training resuming we can stop training on a given iteration if config["stop_iteration"] is not None: @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"])) def _(): logger.info("Stop training on {} iteration".format( trainer.state.iteration)) trainer.terminate() @trainer.on(Events.ITERATION_COMPLETED(every=20)) def print_acc(engine): if rank == 0: print("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}"\ .format(engine.state.epoch, engine.state.iteration, len(train_loader), engine.state.saved_batch_loss )) try: trainer.run(train_loader, max_epochs=config["num_epochs"]) except Exception as e: import traceback print(traceback.format_exc()) if rank == 0: tb_logger.close()
def run(local_rank: int, config: Any, *args: Any, **kwargs: Any): """function to be run by idist.Parallel context manager.""" # ---------------------- # make a certain seed # ---------------------- rank = idist.get_rank() manual_seed(config.seed + rank) # ----------------------- # create output folder # ----------------------- if rank == 0: now = datetime.now().strftime("%Y%m%d-%H%M%S") name = f"{config.model}-backend-{idist.backend()}-{now}" path = Path(config.output_dir, name) path.mkdir(parents=True, exist_ok=True) config.output_dir = path.as_posix() config.output_dir = Path(idist.broadcast(config.output_dir, src=0)) # ----------------------------- # datasets and dataloaders # ----------------------------- # TODO : PLEASE provide your custom datasets and dataloaders configurations # we can use `idist.auto_dataloader` to handle distributed configurations # TODO : PLEASE replace `kwargs` with your desirable DataLoader arguments # See : https://pytorch.org/ignite/distributed.html#ignite.distributed.auto.auto_dataloader train_dataset, eval_dataset = get_datasets(path=config.data_path) train_dataloader = idist.auto_dataloader( train_dataset, batch_size=config.train_batch_size, num_workers=config.num_workers, shuffle=True, {% if use_distributed_training and not use_distributed_launcher %} persistent_workers=True, {% endif %} ) eval_dataloader = idist.auto_dataloader( eval_dataset, batch_size=config.eval_batch_size, num_workers=config.num_workers, shuffle=False, {% if use_distributed_training and not use_distributed_launcher %} persistent_workers=True, {% endif %} ) # ------------------------------------------ # model, optimizer, loss function, device # ------------------------------------------ device = idist.device() config.num_iters_per_epoch = len(train_dataloader) model, optimizer, loss_fn, lr_scheduler = initialize(config=config) # ----------------------------- # trainer and evaluator # ----------------------------- trainer, evaluator = create_trainers( config=config, model=model, optimizer=optimizer, loss_fn=loss_fn, device=device, ) # --------------------------------- # attach metrics to evaluator # --------------------------------- accuracy = Accuracy(device=device) metrics = { "eval_accuracy": accuracy, "eval_loss": Loss(loss_fn, device=device), "eval_error": (1.0 - accuracy) * 100, } for name, metric in metrics.items(): metric.attach(evaluator, name) # ------------------------------------------- # setup engines logger with python logging # print training configurations # ------------------------------------------- logger = setup_logging(config) log_basic_info(logger, config) trainer.logger = logger evaluator.logger = logger # ------------------------------------- # ignite handlers and ignite loggers # ------------------------------------- to_save = {"model": model, "optimizer": optimizer, "trainer": trainer, "lr_scheduler": lr_scheduler} best_model_handler, es_handler, timer_handler = get_handlers( config=config, model=model, trainer=trainer, evaluator=evaluator, metric_name="eval_accuracy", es_metric_name="eval_accuracy", to_save=to_save, lr_scheduler=lr_scheduler, output_names=None, ) # setup ignite logger only on rank 0 if rank == 0: logger_handler = get_logger( config=config, trainer=trainer, evaluator=evaluator, optimizers=optimizer ) # ----------------------------------- # resume from the saved checkpoints # ----------------------------------- if config.resume_from: resume_from(to_load=to_save, checkpoint_fp=config.resume_from) # -------------------------------- # print 
metrics to the stderr # with `add_event_handler` API # for training stats # -------------------------------- trainer.add_event_handler(Events.ITERATION_COMPLETED(every=config.log_every_iters), log_metrics, tag="train") # --------------------------------------------- # run evaluation at every training epoch end # with shortcut `on` decorator API and # print metrics to the stderr # again with `add_event_handler` API # for evaluation stats # --------------------------------------------- @trainer.on(Events.EPOCH_COMPLETED(every=1)) def _(): evaluator.run(eval_dataloader, epoch_length=config.eval_epoch_length) log_metrics(evaluator, "eval") # -------------------------------------------------- # let's try run evaluation first as a sanity check # -------------------------------------------------- @trainer.on(Events.STARTED) def _(): evaluator.run(eval_dataloader, epoch_length=config.eval_epoch_length) # ------------------------------------------ # setup if done. let's run the training # ------------------------------------------ trainer.run(train_dataloader, max_epochs=config.max_epochs, epoch_length=config.train_epoch_length) # ------------------------------------------------------------ # close the logger after the training completed / terminated # ------------------------------------------------------------ if rank == 0: from ignite.contrib.handlers.wandb_logger import WandBLogger if isinstance(logger_handler, WandBLogger): # why handle differently for wandb ? # See : https://github.com/pytorch/ignite/issues/1894 logger_handler.finish() elif logger_handler: logger_handler.close() # ----------------------------------------- # where is my best and last checkpoint ? # ----------------------------------------- if best_model_handler is not None: logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
def training(rank, config):
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    # Define output folder:
    config.output = "/tmp/output"

    model = idist.auto_model(config.model)
    optimizer = idist.auto_optim(config.optimizer)
    criterion = config.criterion

    train_set, val_set = config.train_set, config.val_set
    train_loader = idist.auto_dataloader(train_set, batch_size=config.train_batch_size)
    val_loader = idist.auto_dataloader(val_set, batch_size=config.val_batch_size)

    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED(every=config.val_interval))
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    if rank == 0:
        tb_logger = TensorboardLogger(log_dir=config.output)

        tb_logger.attach_output_handler(
            trainer,
            event_name=Events.ITERATION_COMPLETED(every=100),
            tag="training",
            output_transform=lambda loss: {"batchloss": loss},
            metric_names="all",
        )

        for tag, evaluator in [("training", train_evaluator), ("validation", validation_evaluator)]:
            tb_logger.attach_output_handler(
                evaluator,
                event_name=Events.EPOCH_COMPLETED,
                tag=tag,
                metric_names=["loss", "accuracy"],
                global_step_transform=global_step_from_engine(trainer),
            )

        tb_logger.attach_opt_params_handler(trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer)

    model_checkpoint = ModelCheckpoint(
        config.output,
        n_saved=2,
        filename_prefix="best",
        score_name="accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint, {"model": model})

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if rank == 0:
        tb_logger.close()
def training(local_rank, config): rank = idist.get_rank() manual_seed(config["seed"] + rank) device = idist.device() logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank) log_basic_info(logger, config) output_path = config["output_path"] if rank == 0: now = datetime.now().strftime("%Y%m%d-%H%M%S") folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}" output_path = Path(output_path) / folder_name if not output_path.exists(): output_path.mkdir(parents=True) config["output_path"] = output_path.as_posix() logger.info(f"Output path: {config['output_path']}") if "cuda" in device.type: config["cuda device name"] = torch.cuda.get_device_name(local_rank) if config["with_clearml"]: try: from clearml import Task except ImportError: # Backwards-compatibility for legacy Trains SDK from trains import Task task = Task.init("CIFAR10-Training", task_name=output_path.stem) task.connect_configuration(config) task.connect(config) # Setup dataflow, model, optimizer, criterion train_loader, test_loader = get_dataflow(config) config["num_iters_per_epoch"] = len(train_loader) model, optimizer, criterion, lr_scheduler = initialize(config) logger.info( f"# model parameters (M): {sum([m.numel() for m in model.parameters()]) * 1e-6}" ) # Create trainer for current task trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger) # Let's now setup evaluator engine to perform model's validation and # compute metrics metrics = { "Accuracy": Accuracy(), "Loss": Loss(criterion), } # We define two evaluators as they wont have exactly similar roles: # - `evaluator` will save the best model based on validation score evaluator = create_evaluator(model, metrics=metrics, config=config) train_evaluator = create_evaluator(model, metrics=metrics, config=config) if config["smoke_test"]: logger.info( "Reduce the size of training and test dataloader as smoke_test=True" ) def get_batches(loader): loader_iter = iter(loader) return [next(loader_iter) for _ in range(5)] train_loader = get_batches(train_loader) test_loader = get_batches(test_loader) if config["with_pbar"] and rank == 0: ProgressBar(desc="Evaluation (train)", persist=False).attach(train_evaluator) ProgressBar(desc="Evaluation (val)", persist=False).attach(evaluator) def run_validation(engine): epoch = trainer.state.epoch state = train_evaluator.run(train_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics) state = evaluator.run(test_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics) trainer.add_event_handler( Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation, ) if rank == 0: # Setup TensorBoard logging on trainer and evaluators. Logged values are: # - Training metrics, e.g. 
running average loss values # - Learning rate # - Evaluation train/test metrics evaluators = {"training": train_evaluator, "test": evaluator} tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators) # Store 1 best models by validation accuracy starting from num_epochs / 2: best_model_handler = Checkpoint( {"model": model}, get_save_handler(config), filename_prefix="best", n_saved=1, global_step_transform=global_step_from_engine(trainer), score_name="test_accuracy", score_function=Checkpoint.get_default_score_fn("Accuracy"), ) evaluator.add_event_handler( Events.COMPLETED( lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler, ) try: trainer.run(train_loader, max_epochs=config["num_epochs"]) except Exception as e: logger.exception("") raise e if rank == 0: tb_logger.close()
def run_training(local_rank: int, config: ConfigSchema) -> Dict[str, float]:
    rank = idist.get_rank()
    if config.seed is not None:
        manual_seed(config.seed + rank)

    logger = setup_logger(name=config.experiment_name, distributed_rank=local_rank)

    log_basic_info(logger, config)

    if rank == 0:
        prepare_output_directory(config)
        logger.info("Output path: {}".format(config.output_path))

    weak_label_mgr = get_weak_label_manager(config)

    # Setup dataflow, model, optimizer, criterion
    data_loaders = get_dataflow(config, weak_label_mgr)
    train_loader = data_loaders["train"]
    config.num_iters_per_epoch = len(train_loader)

    model, optimizer, criterion = initialize(config, weak_label_mgr)

    metrics = get_metrics(criterion)
    trainer, evaluators = create_trainer_and_evaluators(
        model, optimizer, criterion, data_loaders, metrics, config, logger
    )

    if rank == 0:
        tb_logger = common.setup_tb_logging(config.output_path, trainer, optimizer, evaluators=evaluators)

        # Store 3 best models by validation accuracy:
        common.gen_save_best_models_by_val_score(
            save_handler=get_save_handler(config),
            evaluator=evaluators["val"],
            models={"model": model},
            metric_name="accuracy",
            n_saved=3,
            trainer=trainer,
            tag="test",
        )

    state_at_best_val = StateAtBestVal(
        score_function=lambda: evaluators["val"].state.metrics["accuracy"],
        state_function=lambda: dict(
            {"val_" + key: val for key, val in evaluators["val"].state.metrics.items()},
            **{"test_" + key: val for key, val in evaluators["test"].state.metrics.items()},
            epoch=trainer.state.epoch,
        ),
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED, state_at_best_val)

    try:
        trainer.run(train_loader, max_epochs=config.num_epochs)
    except Exception:
        import traceback

        print(traceback.format_exc())
    else:
        assert state_at_best_val.best_state is not None
        tb_logger.writer.add_hparams(  # type: ignore
            get_hparams(config),
            {"hparam/" + key: val for key, val in state_at_best_val.best_state.items()},
        )
    finally:
        if rank == 0:
            tb_logger.close()  # type: ignore

    return evaluators["val"].state.metrics
def run( data_path="/tmp/MNIST", seed=3321, mode="xentropy", noise_fraction=0.35, batch_size=64, val_batch_size=1000, num_epochs=50, lr=0.01, momentum=0.5, as_pseudo_label=None, log_dir="/tmp/output-bootstraping-loss/mnist/", with_trains=False, ): """Training on noisy labels with bootstrapping Args: data_path (str): Path to MNIST dataset. Default, "/tmp/MNIST" seed (int): Random seed to setup. Default, 3321 mode (str): Loss function mode: cross-entropy or bootstrapping (soft, hard). Choices 'xentropy', 'soft_bootstrap', 'hard_bootstrap'. noise_fraction (float): Label noise fraction. Default, 0.35. batch_size (int): Input batch size for training. Default, 64. val_batch_size (int): input batch size for validation. Default, 1000. num_epochs (int): Number of epochs to train. Default, 50. lr (float): Learning rate. Default, 0.01. momentum (float): SGD momentum. Default, 0.5. log_dir (str): Log directory for Tensorboard log output. Default="/tmp/output-bootstraping-loss/mnist/". with_trains (bool): if True, experiment Trains logger is setup. Default, False. """ assert torch.cuda.is_available(), "Training should running on GPU" device = "cuda" manual_seed(seed) logger = setup_logger(name="MNIST-Training") now = datetime.now().strftime("%Y%m%d-%H%M%S") # Setup output path suffix = "" if mode == "soft_bootstrap" and (as_pseudo_label is not None and not as_pseudo_label): suffix = "as_xreg" output_path = Path(log_dir) / "train_{}_{}_{}_{}__{}".format( mode, noise_fraction, suffix, now, num_epochs) if not output_path.exists(): output_path.mkdir(parents=True) parameters = { "seed": seed, "mode": mode, "noise_fraction": noise_fraction, "batch_size": batch_size, "num_epochs": num_epochs, "lr": lr, "momentum": momentum, "as_pseudo_label": as_pseudo_label, } log_basic_info(logger, parameters) if with_trains: from trains import Task task = Task.init("BootstrappingLoss - Experiments on MNIST", task_name=output_path.name) # Log hyper parameters task.connect(parameters) train_loader, test_loader = get_data_loaders(data_path, noise_fraction, batch_size, val_batch_size) model = Net().to(device) optimizer = SGD(model.parameters(), lr=lr, momentum=momentum) if mode == 'xentropy': criterion = nn.CrossEntropyLoss() elif mode == 'soft_bootstrap': if as_pseudo_label is None: as_pseudo_label = True criterion = SoftBootstrappingLoss(beta=0.95, as_pseudo_label=as_pseudo_label) elif mode == 'hard_bootstrap': criterion = HardBootstrappingLoss(beta=0.8) else: raise ValueError( "Wrong mode {}, expected: xentropy, soft_bootstrap or hard_bootstrap" .format(mode)) trainer = create_supervised_trainer(model, optimizer, criterion, device=device, non_blocking=True) metrics = { "Accuracy": Accuracy(), "{} loss".format(mode): Loss(criterion), } if mode is not "xentropy": metrics["xentropy loss"] = Loss(nn.CrossEntropyLoss()) evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) def run_validation(engine): epoch = trainer.state.epoch state = train_evaluator.run(train_loader) log_metrics(logger, epoch, "Train", state.metrics) state = evaluator.run(test_loader) log_metrics(logger, epoch, "Test", state.metrics) trainer.add_event_handler(Events.EPOCH_COMPLETED | Events.COMPLETED, run_validation) evaluators = {"training": train_evaluator, "test": evaluator} tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators) trainer.run(train_loader, 
max_epochs=num_epochs) test_acc = evaluator.state.metrics["Accuracy"] tb_logger.writer.add_hparams(parameters, {"hparam/test_accuracy": test_acc}) tb_logger.close() return (mode, noise_fraction, as_pseudo_label, test_acc)
def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
    """function to be run by idist.Parallel context manager."""

    # ----------------------
    # make a certain seed
    # ----------------------
    rank = idist.get_rank()
    manual_seed(config.seed + rank)

    # -----------------------
    # create output folder
    # -----------------------
    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")
        name = f"{config.dataset}-backend-{idist.backend()}-{now}"
        path = Path(config.output_dir, name)
        path.mkdir(parents=True, exist_ok=True)
        config.output_dir = path.as_posix()

    config.output_dir = Path(idist.broadcast(config.output_dir, src=0))

    # -----------------------------
    # datasets and dataloaders
    # -----------------------------
    train_dataset, num_channels = get_datasets(config.dataset, config.data_path)

    train_dataloader = idist.auto_dataloader(
        train_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
{% if use_distributed_training and not use_distributed_launcher %}
        persistent_workers=True,
{% endif %}
    )

    # ------------------------------------------
    # model, optimizer, loss function, device
    # ------------------------------------------
    device = idist.device()
    netD, netG, optimizerD, optimizerG, loss_fn, lr_scheduler = initialize(config, num_channels)

    # -----------------------------
    # trainer and evaluator
    # -----------------------------
    ws = idist.get_world_size()
    real_labels = torch.ones(config.batch_size // ws, device=device)
    fake_labels = torch.zeros(config.batch_size // ws, device=device)
    fixed_noise = torch.randn(config.batch_size // ws, config.z_dim, 1, 1, device=device)

    trainer = create_trainers(
        config=config,
        netD=netD,
        netG=netG,
        optimizerD=optimizerD,
        optimizerG=optimizerG,
        loss_fn=loss_fn,
        device=device,
        real_labels=real_labels,
        fake_labels=fake_labels,
    )

    # -------------------------------------------
    # setup engines logger with python logging
    # print training configurations
    # -------------------------------------------
    logger = setup_logging(config)
    log_basic_info(logger, config)
    trainer.logger = logger

    # -------------------------------------
    # ignite handlers and ignite loggers
    # -------------------------------------
    to_save = {"netD": netD, "netG": netG, "optimizerD": optimizerD, "optimizerG": optimizerG, "trainer": trainer}
    optimizers = {"optimizerD": optimizerD, "optimizerG": optimizerG}
    best_model_handler, es_handler, timer_handler = get_handlers(
        config=config,
        model={"netD": netD, "netG": netG},
        trainer=trainer,
        evaluator=trainer,
        metric_name="errD",
        es_metric_name="errD",
        to_save=to_save,
        lr_scheduler=lr_scheduler,
        output_names=["errD", "errG", "D_x", "D_G_z1", "D_G_z2"],
    )

    # setup ignite logger only on rank 0
    if rank == 0:
        logger_handler = get_logger(config=config, trainer=trainer, optimizers=optimizers)

    # -----------------------------------
    # resume from the saved checkpoints
    # -----------------------------------
    if config.resume_from:
        resume_from(to_load=to_save, checkpoint_fp=config.resume_from)

    # --------------------------------------------------
    # adding handlers using `trainer.on` decorator API
    # --------------------------------------------------
    @trainer.on(Events.EPOCH_COMPLETED)
    def save_fake_example(engine):
        fake = netG(fixed_noise)
        path = config.output_dir / (FAKE_IMG_FNAME.format(engine.state.epoch))
        vutils.save_image(fake.detach(), path, normalize=True)

    # --------------------------------------------------
    # adding handlers using `trainer.on` decorator API
    # --------------------------------------------------
    @trainer.on(Events.EPOCH_COMPLETED)
    def save_real_example(engine):
        img, y = engine.state.batch
        path = config.output_dir / (REAL_IMG_FNAME.format(engine.state.epoch))
        vutils.save_image(img, path, normalize=True)

    # -------------------------------------------------------------
    # adding handlers using `trainer.on` decorator API
    # -------------------------------------------------------------
    @trainer.on(Events.EPOCH_COMPLETED)
    def print_times(engine):
        if timer_handler is not None:
            logger.info(f"Epoch {engine.state.epoch} done. Time per batch: {timer_handler.value():.3f}[s]")
            timer_handler.reset()

    @trainer.on(Events.ITERATION_COMPLETED(every=config.log_every_iters))
    @idist.one_rank_only()
    def print_logs(engine):
        fname = config.output_dir / LOGS_FNAME
        columns = ["iteration"] + list(engine.state.metrics.keys())
        values = [str(engine.state.iteration)] + [str(round(value, 5)) for value in engine.state.metrics.values()]

        with open(fname, "a") as f:
            if f.tell() == 0:
                print("\t".join(columns), file=f)
            print("\t".join(values), file=f)

        message = f"[{engine.state.epoch}/{config.max_epochs}][{engine.state.iteration % len(train_dataloader)}/{len(train_dataloader)}]"
        for name, value in zip(columns, values):
            message += f" | {name}: {value}"
        logger.info(message)

    # -------------------------------------------------------------
    # adding handlers using `trainer.on` decorator API
    # -------------------------------------------------------------
    @trainer.on(Events.EPOCH_COMPLETED)
    def create_plots(engine):
        try:
            import matplotlib as mpl

            mpl.use("agg")

            import matplotlib.pyplot as plt
            import pandas as pd
        except ImportError:
            warnings.warn("Loss plots will not be generated -- pandas or matplotlib not found")
        else:
            df = pd.read_csv(config.output_dir / LOGS_FNAME, delimiter="\t", index_col="iteration")
            _ = df.plot(subplots=True, figsize=(20, 20))
            _ = plt.xlabel("Iteration number")
            fig = plt.gcf()
            path = config.output_dir / PLOT_FNAME
            fig.savefig(path)

    # --------------------------------
    # print metrics to the stderr
    # with `add_event_handler` API
    # for training stats
    # --------------------------------
    trainer.add_event_handler(Events.ITERATION_COMPLETED(every=config.log_every_iters), log_metrics, tag="train")

    # ------------------------------------------
    # setup if done. let's run the training
    # ------------------------------------------
    trainer.run(train_dataloader, max_epochs=config.max_epochs, epoch_length=config.train_epoch_length)

    # ------------------------------------------------------------
    # close the logger after the training completed / terminated
    # ------------------------------------------------------------
    if rank == 0:
        from ignite.contrib.handlers.wandb_logger import WandBLogger

        if isinstance(logger_handler, WandBLogger):
            # why handle differently for wandb ?
            # See : https://github.com/pytorch/ignite/issues/1894
            logger_handler.finish()
        elif logger_handler:
            logger_handler.close()

    # -----------------------------------------
    # where is my best and last checkpoint ?
    # -----------------------------------------
    if best_model_handler is not None:
        logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
        splitter.is_fitted = True
        splitter.save(base_transforms)
        logging.info(f"Splitter fitted and saved. [{splitter}]")

    if not synthesizer.is_fitted:
        synthesizer.is_fitted = True
        synthesizer.save()
        logging.info(f"Synthesizer fitted and saved. [{synthesizer}]")

    all_dataset_indices = splitter.all_dataset_indices
    all_groups = splitter.all_groups

    ############# extract from fitted transformer ###############
    num_edge_types = next(x for x in base_transforms.transforms if isinstance(x, T.NodeConnector)).num_edge_types
    logging.info(f"#Edge-Types: {num_edge_types}")
    ###################################

    manual_seed(seed)  # set seed, ignite function

    group_iterator = splitter.split()
    training_status_meta_dict = get_training_status_meta_dict(
        splitter.path_to_dir, args.model, args.num_splits, args.epochs
    )

    for split_idx, (train_groups, val_groups, test_groups) in enumerate(group_iterator):
        logging.info(f"Split: {split_idx + 1}")
        training_status_meta = training_status_meta_dict.get(split_idx)
        logging.info(f"Training status meta: {training_status_meta}")

        if training_status_meta.get("finished", False) or (
            args.target_splits and (split_idx + 1) not in args.target_splits
        ):
def train(cfg: DictConfig) -> None: # Determine device (GPU, CPU, etc.) device = "cuda" if torch.cuda.is_available() else "cpu" # Model model = get_network(cfg) # Data Loaders (select example) if cfg.example == "classification": train_loader, val_loader = get_dataloaders( cfg, num_workers=cfg.data_loader_workers, dataset_name="mnist") # Your training loop trainer = create_classification_training_loop(model, cfg, "trainer", device=device) # Your evaluation loop evaluator = create_classification_evaluation_loop(model, cfg, "evaluator", device=device) else: train_loader, val_loader = get_dataloaders( cfg, num_workers=cfg.data_loader_workers, dataset_name="reunion") # Your training loop trainer = create_regression_training_loop(model, cfg, "trainer", device=device) # Your evaluation loop evaluator = create_regression_evaluation_loop(model, cfg, "evaluator", device=device) ld = LogDirector(cfg, engines=[trainer, evaluator]) # Set configuration defined random seed manual_seed(cfg.random_seed) ######################################################################## # Logging Callbacks ######################################################################## # Helper to run the evaluation loop def run_evaluator(): evaluator.run(val_loader) # NOTE: Must return the engine we want to log from return evaluator if cfg.example == "regression": ld.set_event_handlers( trainer, Events.EPOCH_COMPLETED(every=1), EngineStateAttr.OUTPUT, log_operations=[ (LOG_OP.LOG_MESSAGE, ["loss"]), (LOG_OP.SAVE_IN_DATA_FILE, ["loss"]), ( LOG_OP.NUMBER_TO_VISDOM, [ # First plot, key is "p1" VisPlot( var_name="loss", plot_key="p1", split="mse", env="main", opts={ "title": "Train error (loss) ", "x_label": "Iters", "y_label": "nll", }, ) ], ), ], ) ld.set_event_handlers( trainer, Events.EPOCH_COMPLETED(every=20), EngineStateAttr.METRICS, engine_producer=run_evaluator, log_operations=[ ( LOG_OP.NUMBER_TO_VISDOM, [ # First plot, key is "p1" VisPlot( var_name="loss", plot_key="p2", split="mse", opts={ "title": "Eval error (loss)", "x_label": "Epoch", "y_label": "Loss", }, env="main", ) ], ), ( LOG_OP.NUMBER_TO_VISDOM, [ VisPlot( var_name="mape", plot_key="p3", split="MAPE", opts={ "title": "Eval error (MAPE)", "x_label": "Epoch", "y_label": "MAPE", }, env="main", ) ], ), ], ) def score_function(engine): return evaluator.state.metrics["mape"] handler = ModelCheckpoint("models/", "checkpoint", score_function=score_function, n_saved=5) trainer.add_event_handler(Events.EPOCH_COMPLETED(every=50), handler, {"mlp": model}) ld.set_event_handlers( trainer, Events.EPOCH_COMPLETED(every=100), EngineStateAttr.OUTPUT, engine_producer=run_evaluator, log_operations=[( LOG_OP.VECTOR_TO_VISDOM, [ VisPlot( var_name="ypred_first", plot_key="e", split="nll", opts={ "title": "Predictions vs. 
Ground Truth, # ", "x_label": "Time", "y_label": "Energy Consumption", }, env="main", ) ], )], ) if cfg.example == "classification": ld.set_event_handlers( trainer, Events.ITERATION_COMPLETED(every=50), EngineStateAttr.OUTPUT, do_before_logging=postprocess_image_to_log, log_operations=[ (LOG_OP.SAVE_IMAGE, ["im"]), # Save images to a folder (LOG_OP.LOG_MESSAGE, ["nll" ]), # Log fields as message in logfile ( LOG_OP.SAVE_IN_DATA_FILE, ["nll"], ), # Log fields as separate data files ( LOG_OP.NUMBER_TO_VISDOM, [ # First plot, key is "p1" VisPlot( var_name="nll", plot_key="p1", split="nll_1", # Any opts that Visdom supports opts={ "title": "Plot 1", "xlabel": "Iters", "fillarea": True, }, ), VisPlot(var_name="nll_2", plot_key="p1", split="nll_2"), ], ), ( LOG_OP.IMAGE_TO_VISDOM, [ VisImg( var_name="im", img_key="1", env="images", opts={ "caption": "a current image", "title": "title" }, ), VisImg( var_name="im", img_key="2", env="images", opts={ "caption": "a current image", "title": "title" }, ), ], ), ], ) ld.set_event_handlers( trainer, Events.EPOCH_COMPLETED, EngineStateAttr.METRICS, # Run the evaluation loop, then do log operations from the return engine engine_producer=run_evaluator, log_operations=[ ( LOG_OP.LOG_MESSAGE, ["nll", "accuracy"], ), # Log fields as message in logfile ( LOG_OP.SAVE_IN_DATA_FILE, ["accuracy"], ), # Log fields as separate data files ( LOG_OP.NUMBER_TO_VISDOM, [ # First plot, key is "p1" VisPlot( var_name="accuracy", plot_key="p3", split="acc", # Any opts that Visdom supports opts={ "title": "Eval Acc", "xlabel": "Iters" }, ), # First plot, key is "p1" VisPlot( var_name="nll", plot_key="p4", split="nll", # Any opts that Visdom supports opts={ "title": "Eval Nll", "xlabel": "Iters", "fillarea": True, }, ), ], ), ], ) # Execute training if cfg.example == "regression": trainer.run(train_loader, max_epochs=1000) else: trainer.run(train_loader, max_epochs=cfg.mode.train.max_epochs)
def training(local_rank, config, logger, with_clearml):
    rank = idist.get_rank()
    manual_seed(config.seed + local_rank)

    train_loader = config.train_loader
    val_loader = config.val_loader
    train_eval_loader = config.train_eval_loader

    model, optimizer, criterion = utils.initialize(config)

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger, with_clearml)

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if ("val_metrics" in config) and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="val")
    train_evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="train")

    val_interval = config.get("val_interval", 1)

    # Run validation on every val_interval epoch, at the end of the training
    # and at the beginning if config.start_by_validation is True
    event = Events.EPOCH_COMPLETED(every=val_interval)
    if config.num_epochs % val_interval != 0:
        event |= Events.COMPLETED
    if config.get("start_by_validation", False):
        event |= Events.STARTED

    @trainer.on(event)
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        utils.log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(val_loader)
        utils.log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    score_metric_name = "mIoU_bg"
    if "es_patience" in config:
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    # Store 2 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=utils.get_save_handler(config.output_path.as_posix(), with_clearml),
        evaluator=evaluator,
        models=model,
        metric_name=score_metric_name,
        n_saved=2,
        trainer=trainer,
        tag="val",
    )

    # Setup Tensorboard logger
    if rank == 0:
        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={"training": train_evaluator, "validation": evaluator},
        )

        # Log validation predictions as images
        # We define a custom event filter to log the images less frequently (to reduce storage size)
        # - we plot images with masks of the middle validation batch
        # - once every 3 validations and
        # - at the end of the training
        def custom_event_filter(_, val_iteration):
            c1 = val_iteration == len(val_loader) // 2
            c2 = trainer.state.epoch % (config.get("val_interval", 1) * 3) == 0
            c2 |= trainer.state.epoch == config.num_epochs
            return c1 and c2

        # Image denormalization function to plot predictions with images
        mean = config.get("mean", (0.485, 0.456, 0.406))
        std = config.get("std", (0.229, 0.224, 0.225))
        img_denormalize = partial(data.denormalize, mean=mean, std=std)

        tb_logger.attach(
            evaluator,
            log_handler=vis.predictions_gt_images_handler(
                img_denormalize_fn=img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation",
            ),
            event_name=Events.ITERATION_COMPLETED(event_filter=custom_event_filter),
        )

    # Log confusion matrix to ClearML:
    if with_clearml:
        trainer.add_event_handler(Events.COMPLETED, compute_and_log_cm, cm_metric, trainer.state.iteration)

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if idist.get_rank() == 0:
        tb_logger.close()
def run(local_rank: int, config: Any, *args: Any, **kwargs: Any): """function to be run by idist.Parallel context manager.""" # ---------------------- # make a certain seed # ---------------------- rank = idist.get_rank() manual_seed(config.seed + rank) # ----------------------- # create output folder # ----------------------- if rank == 0: now = datetime.now().strftime("%Y%m%d-%H%M%S") name = f"{config.dataset}-backend-{idist.backend()}-{now}" path = Path(config.output_dir, name) path.mkdir(parents=True, exist_ok=True) config.output_dir = path.as_posix() config.output_dir = Path(idist.broadcast(config.output_dir, src=0)) # ----------------------------- # datasets and dataloaders # ----------------------------- # TODO : PLEASE provide your custom datasets and dataloaders configurations # we can use `idist.auto_dataloader` to handle distributed configurations # TODO : PLEASE replace `kwargs` with your desirable DataLoader arguments # See : https://pytorch.org/ignite/distributed.html#ignite.distributed.auto.auto_dataloader train_dataset, eval_dataset = get_datasets() train_dataloader = idist.auto_dataloader(train_dataset, **kwargs) eval_dataloader = idist.auto_dataloader(eval_dataset, **kwargs) # ------------------------------------------ # model, optimizer, loss function, device # ------------------------------------------ device = idist.device() model, optimizer, loss_fn, lr_scheduler = initialize() # ----------------------------- # trainer and evaluator # ----------------------------- trainer, evaluator = create_trainers( config=config, model=model, optimizer=optimizer, loss_fn=loss_fn, device=device, ) # ------------------------------------------- # update config with optimizer parameters # setup engines logger with python logging # print training configurations # ------------------------------------------- config.__dict__.update(**optimizer.defaults) logger = setup_logging(config) log_basic_info(logger, config) trainer.logger = logger evaluator.logger = logger # ------------------------------------- # ignite handlers and ignite loggers # ------------------------------------- to_save = {"model": model, "optimizer": optimizer, "trainer": trainer, "lr_scheduler": lr_scheduler} best_model_handler, es_handler, timer_handler = get_handlers( config=config, model=model, trainer=trainer, evaluator=evaluator, metric_name=None, # TODO : replace with the metric name to save the best model # if you check `Save the best model by evaluation score` otherwise leave it None # metric must be in evaluator.state.metrics. es_metric_name=None, # TODO : replace with the metric name to early stop # if you check `Early stop the training by evaluation score` otherwise leave it None # metric must be in evaluator.state.metrics. 
to_save=to_save, lr_scheduler=lr_scheduler, output_names=None, ) # setup ignite logger only on rank 0 if rank == 0: logger_handler = get_logger( config=config, trainer=trainer, evaluator=evaluator, optimizers=optimizer ) # ----------------------------------- # resume from the saved checkpoints # ----------------------------------- if config.resume_from: resume_from(to_load=to_save, checkpoint_fp=config.resume_from) # -------------------------------------------- # let's trigger custom events we registered # we will use a `event_filter` to trigger that # `event_filter` has to return boolean # whether this event should be executed # here will log the gradients on the 1st iteration # and every 100 iterations # -------------------------------------------- @trainer.on(TrainEvents.BACKWARD_COMPLETED(lambda _, ev: (ev % 100 == 0) or (ev == 1))) def _(): # do something interesting pass # ---------------------------------------- # here we will use `every` to trigger # every 100 iterations # ---------------------------------------- @trainer.on(TrainEvents.OPTIM_STEP_COMPLETED(every=100)) def _(): # do something interesting pass # -------------------------------- # print metrics to the stderr # with `add_event_handler` API # for training stats # -------------------------------- trainer.add_event_handler(Events.ITERATION_COMPLETED(every=config.log_every_iters), log_metrics, tag="train") # --------------------------------------------- # run evaluation at every training epoch end # with shortcut `on` decorator API and # print metrics to the stderr # again with `add_event_handler` API # for evaluation stats # --------------------------------------------- @trainer.on(Events.EPOCH_COMPLETED(every=1)) def _(): evaluator.run(eval_dataloader, epoch_length=config.eval_epoch_length) log_metrics(evaluator, "eval") # -------------------------------------------------- # let's try run evaluation first as a sanity check # -------------------------------------------------- @trainer.on(Events.STARTED) def _(): evaluator.run(eval_dataloader, epoch_length=config.eval_epoch_length) # ------------------------------------------ # setup if done. let's run the training # ------------------------------------------ # TODO : PLEASE provide `max_epochs` parameters trainer.run(train_dataloader, max_epochs=config.max_epochs, epoch_length=config.train_epoch_length) # ------------------------------------------------------------ # close the logger after the training completed / terminated # ------------------------------------------------------------ if rank == 0: from ignite.contrib.handlers.wandb_logger import WandBLogger if isinstance(logger_handler, WandBLogger): # why handle differently for wandb ? # See : https://github.com/pytorch/ignite/issues/1894 logger_handler.finish() elif logger_handler: logger_handler.close() # ----------------------------------------- # where is my best and last checkpoint ? # ----------------------------------------- if best_model_handler is not None: logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
def run(
    train_batch_size,
    val_batch_size,
    epochs,
    lr,
    momentum,
    log_interval,
    log_dir,
    checkpoint_every,
    resume_from,
    crash_iteration=-1,
    deterministic=False,
):
    # Setup seed to have same model's initialization:
    manual_seed(75)

    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    writer = SummaryWriter(log_dir=log_dir)
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    criterion = nn.NLLLoss()
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

    # Setup trainer and evaluator
    if deterministic:
        tqdm.write("Setup deterministic trainer")
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device, deterministic=deterministic)

    evaluator = create_supervised_evaluator(
        model, metrics={"accuracy": Accuracy(), "nll": Loss(criterion)}, device=device
    )

    # Apply learning rate scheduling
    @trainer.on(Events.EPOCH_COMPLETED)
    def lr_step(engine):
        lr_scheduler.step()

    pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=f"Epoch {0} - loss: {0:.4f} - lr: {lr:.4f}")

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        lr = optimizer.param_groups[0]["lr"]
        pbar.desc = f"Epoch {engine.state.epoch} - loss: {engine.state.output:.4f} - lr: {lr:.4f}"
        pbar.update(log_interval)
        writer.add_scalar("training/loss", engine.state.output, engine.state.iteration)
        writer.add_scalar("lr", lr, engine.state.iteration)

    if crash_iteration > 0:

        @trainer.on(Events.ITERATION_COMPLETED(once=crash_iteration))
        def _(engine):
            raise Exception(f"STOP at {engine.state.iteration}")

    if resume_from is not None:

        @trainer.on(Events.STARTED)
        def _(engine):
            pbar.n = engine.state.iteration % engine.state.epoch_length

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            f"Training Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )
        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy, engine.state.epoch)

    # Compute and log validation metrics
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            f"Validation Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )
        pbar.n = pbar.last_print_n = 0
        writer.add_scalar("validation/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("validation/avg_accuracy", avg_accuracy, engine.state.epoch)

    # Setup object to checkpoint
    objects_to_checkpoint = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler}
    training_checkpoint = Checkpoint(
        to_save=objects_to_checkpoint,
        save_handler=DiskSaver(log_dir, require_empty=False),
        n_saved=None,
        global_step_transform=lambda *_: trainer.state.epoch,
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=checkpoint_every), training_checkpoint)

    # Setup logger to print and dump into file: model weights, model grads and data stats
    # - first 3 iterations
    # - 4 iterations after checkpointing
    # This helps to compare resumed training with checkpointed training
    def log_event_filter(e, event):
        if event in [1, 2, 3]:
            return True
        elif 0 <= (event % (checkpoint_every * e.state.epoch_length)) < 5:
            return True
        return False

    fp = Path(log_dir) / ("run.log" if resume_from is None else "resume_run.log")
    fp = fp.as_posix()
    for h in [log_data_stats, log_model_weights, log_model_grads]:
        trainer.add_event_handler(Events.ITERATION_COMPLETED(event_filter=log_event_filter), h, model=model, fp=fp)

    if resume_from is not None:
        tqdm.write(f"Resume from the checkpoint: {resume_from}")
        checkpoint = torch.load(resume_from)
        Checkpoint.load_objects(to_load=objects_to_checkpoint, checkpoint=checkpoint)

    try:
        # Synchronize random states
        manual_seed(15)
        trainer.run(train_loader, max_epochs=epochs)
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    pbar.close()
    writer.close()
def training(local_rank, config): rank = idist.get_rank() manual_seed(config["seed"] + rank) device = idist.device() logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank) log_basic_info(logger, config) output_path = config["output_path"] if rank == 0: if config["stop_iteration"] is None: now = datetime.now().strftime("%Y%m%d-%H%M%S") else: now = f"stop-on-{config['stop_iteration']}" folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}" output_path = Path(output_path) / folder_name if not output_path.exists(): output_path.mkdir(parents=True) config["output_path"] = output_path.as_posix() logger.info(f"Output path: {config['output_path']}") if "cuda" in device.type: config["cuda device name"] = torch.cuda.get_device_name(local_rank) if config["with_clearml"]: try: from clearml import Task except ImportError: # Backwards-compatibility for legacy Trains SDK from trains import Task task = Task.init("CIFAR10-Training", task_name=output_path.stem) task.connect_configuration(config) # Log hyper parameters hyper_params = [ "model", "batch_size", "momentum", "weight_decay", "num_epochs", "learning_rate", "num_warmup_epochs", ] task.connect({k: config[k] for k in hyper_params}) # Setup dataflow, model, optimizer, criterion train_loader, test_loader = get_dataflow(config) config["num_iters_per_epoch"] = len(train_loader) model, optimizer, criterion, lr_scheduler = initialize(config) # Create trainer for current task trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger) # Let's now setup evaluator engine to perform model's validation and compute metrics metrics = { "Accuracy": Accuracy(), "Loss": Loss(criterion), } # We define two evaluators as they wont have exactly similar roles: # - `evaluator` will save the best model based on validation score evaluator = create_evaluator(model, metrics=metrics, config=config) train_evaluator = create_evaluator(model, metrics=metrics, config=config) def run_validation(engine): epoch = trainer.state.epoch state = train_evaluator.run(train_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics) state = evaluator.run(test_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics) trainer.add_event_handler(Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation) if rank == 0: # Setup TensorBoard logging on trainer and evaluators. Logged values are: # - Training metrics, e.g. 
running average loss values # - Learning rate # - Evaluation train/test metrics evaluators = {"training": train_evaluator, "test": evaluator} tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators) # Store 2 best models by validation accuracy starting from num_epochs / 2: best_model_handler = Checkpoint( {"model": model}, get_save_handler(config), filename_prefix="best", n_saved=2, global_step_transform=global_step_from_engine(trainer), score_name="test_accuracy", score_function=Checkpoint.get_default_score_fn("Accuracy"), ) evaluator.add_event_handler( Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler ) # In order to check training resuming we can stop training on a given iteration if config["stop_iteration"] is not None: @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"])) def _(): logger.info(f"Stop training on {trainer.state.iteration} iteration") trainer.terminate() try: trainer.run(train_loader, max_epochs=config["num_epochs"]) except Exception as e: logger.exception("") raise e if rank == 0: tb_logger.close()
def training(local_rank, cfg):
    logger = setup_logger("FixMatch Training", distributed_rank=idist.get_rank())

    if local_rank == 0:
        logger.info(cfg.pretty())

    rank = idist.get_rank()
    manual_seed(cfg.seed + rank)
    device = idist.device()

    model, ema_model, optimizer, sup_criterion, lr_scheduler = utils.initialize(cfg)

    unsup_criterion = instantiate(cfg.solver.unsupervised_criterion)

    cta = get_default_cta()

    (
        supervised_train_loader,
        test_loader,
        unsup_train_loader,
        cta_probe_loader,
    ) = utils.get_dataflow(cfg, cta=cta, with_unsup=True)

    def train_step(engine, batch):
        model.train()
        optimizer.zero_grad()

        x, y = batch["sup_batch"]["image"], batch["sup_batch"]["target"]
        if x.device != device:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

        weak_x, strong_x = (
            batch["unsup_batch"]["image"],
            batch["unsup_batch"]["strong_aug"],
        )
        if weak_x.device != device:
            weak_x = weak_x.to(device, non_blocking=True)
            strong_x = strong_x.to(device, non_blocking=True)

        # according to TF code: single forward pass on concat data: [x, weak_x, strong_x]
        le = 2 * engine.state.mu_ratio + 1
        # Why interleave: https://github.com/google-research/fixmatch/issues/20#issuecomment-613010277
        # We need to interleave due to multi-GPU batch norm issues. Say we have two GPUs and our batch is
        # comprised of labeled (L) and unlabeled (U) images, with a labeled batch size of 2 to keep the
        # example easy to visualize.
        #
        # - Without interleaving, we have a batch LLUUUUUU...U (there are 14 U). When the batch is split
        #   across both GPUs, we get LLUUUUUU and UUUUUUUU: all labeled examples end up in the batch sent
        #   to GPU1. Batch norm is computed per batch, so the moments lack consistency between batches.
        #
        # - With interleaving, by contrast, the two batches are LUUUUUUU and LUUUUUUU. They have the same
        #   distribution of labeled and unlabeled samples and therefore more consistent moments.
        #
        x_cat = interleave(torch.cat([x, weak_x, strong_x], dim=0), le)
        y_pred_cat = model(x_cat)
        y_pred_cat = deinterleave(y_pred_cat, le)

        idx1 = len(x)
        idx2 = idx1 + len(weak_x)
        y_pred = y_pred_cat[:idx1, ...]
        y_weak_preds = y_pred_cat[idx1:idx2, ...]  # logits_weak
        y_strong_preds = y_pred_cat[idx2:, ...]  # logits_strong

        # supervised learning:
        sup_loss = sup_criterion(y_pred, y)

        # unsupervised learning:
        y_weak_probas = torch.softmax(y_weak_preds, dim=1).detach()
        y_pseudo = y_weak_probas.argmax(dim=1)
        max_y_weak_probas, _ = y_weak_probas.max(dim=1)
        unsup_loss_mask = (max_y_weak_probas >= engine.state.confidence_threshold).float()
        unsup_loss = (unsup_criterion(y_strong_preds, y_pseudo) * unsup_loss_mask).mean()

        total_loss = sup_loss + engine.state.lambda_u * unsup_loss

        total_loss.backward()
        optimizer.step()

        return {
            "total_loss": total_loss.item(),
            "sup_loss": sup_loss.item(),
            "unsup_loss": unsup_loss.item(),
            "mask": unsup_loss_mask.mean().item(),  # this should not be averaged for DDP
        }

    output_names = ["total_loss", "sup_loss", "unsup_loss", "mask"]

    trainer = trainers.create_trainer(
        train_step,
        output_names=output_names,
        model=model,
        ema_model=ema_model,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        supervised_train_loader=supervised_train_loader,
        test_loader=test_loader,
        cfg=cfg,
        logger=logger,
        cta=cta,
        unsup_train_loader=unsup_train_loader,
        cta_probe_loader=cta_probe_loader,
    )

    trainer.state.confidence_threshold = cfg.ssl.confidence_threshold
    trainer.state.lambda_u = cfg.ssl.lambda_u
    trainer.state.mu_ratio = cfg.ssl.mu_ratio

    distributed = idist.get_world_size() > 1

    @trainer.on(Events.ITERATION_COMPLETED(every=cfg.ssl.cta_update_every))
    def update_cta_rates():
        batch = trainer.state.batch
        x, y = batch["cta_probe_batch"]["image"], batch["cta_probe_batch"]["target"]
        if x.device != device:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

        policies = batch["cta_probe_batch"]["policy"]

        ema_model.eval()
        with torch.no_grad():
            y_pred = ema_model(x)
            y_probas = torch.softmax(y_pred, dim=1)  # (N, C)

            if not distributed:
                for y_proba, t, policy in zip(y_probas, y, policies):
                    error = y_proba
                    error[t] -= 1
                    error = torch.abs(error).sum()
                    cta.update_rates(policy, 1.0 - 0.5 * error.item())
            else:
                error_per_op = []
                for y_proba, t, policy in zip(y_probas, y, policies):
                    error = y_proba
                    error[t] -= 1
                    error = torch.abs(error).sum()
                    for k, bins in policy:
                        error_per_op.append(pack_as_tensor(k, bins, error))
                error_per_op = torch.stack(error_per_op)
                # all gather
                tensor_list = idist.all_gather(error_per_op)
                # update cta rates
                for t in tensor_list:
                    k, bins, error = unpack_from_tensor(t)
                    cta.update_rates([(k, bins)], 1.0 - 0.5 * error)

    epoch_length = cfg.solver.epoch_length
    num_epochs = cfg.solver.num_epochs if not cfg.debug else 2

    try:
        trainer.run(supervised_train_loader, epoch_length=epoch_length, max_epochs=num_epochs)
    except Exception:
        import traceback

        print(traceback.format_exc())
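# The `interleave` / `deinterleave` helpers used above are project-local utilities not shown in
# this snippet. A minimal sketch of what they presumably do, following the reference FixMatch
# implementations, is given below; the names and the exact reshape layout are assumptions.
def interleave(x, size):
    # spread each group of `size` samples evenly across the concatenated batch
    s = list(x.shape)
    return x.reshape([-1, size] + s[1:]).transpose(0, 1).reshape([-1] + s[1:])


def deinterleave(x, size):
    # inverse of `interleave`: recover the original [x, weak_x, strong_x] ordering
    s = list(x.shape)
    return x.reshape([size, -1] + s[1:]).transpose(0, 1).reshape([-1] + s[1:])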
def training(local_rank, config): config["device"] = "cuda" if config["active_gpu_ids"] else "cpu" rank = idist.get_rank() manual_seed(config["seed"] + rank) device = idist.device() logger = setup_logger(name="Carbon Black Semantic Segmentation Training", distributed_rank=local_rank) log_basic_info(logger, config) output_path = config["output_path"] if rank == 0: if config["stop_iteration"] is None: now = utils.get_time_stamp() else: now = f"stop-on-{config['stop_iteration']}" folder_name = ( f"{config['architecture']}-{config['encoder']}-{config['encoder_weights']}_" f"backend-{idist.backend()}-{idist.get_world_size()}_{now}") output_path = Path(output_path) / folder_name output_path.mkdir(parents=True, exist_ok=True) config["output_path"] = output_path.as_posix() config["task_name"] = output_path.stem logger.info(f"Output path: {output_path}") if "cuda" in idist.device().type: config["cuda_device_name"] = torch.cuda.get_device_name(local_rank) setup_trains_logging(config) dataloader_train, dataloader_val = get_dataloaders(config) config["num_iterations_per_epoch"] = len(dataloader_train) config["num_epochs"] = round(config["num_iterations"] / config["num_iterations_per_epoch"]) model = modeling.get_model(config) optimizer = get_optimizer(model, config) loss = get_loss() lr_scheduler = get_lr_scheduler(optimizer, config) trainer = create_trainer(model, optimizer, loss, lr_scheduler, dataloader_train.sampler, config, logger) metrics = get_metrics(loss) # We define two evaluators as they wont have exactly similar roles: # - `evaluator` will save the best model based on validation score evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) evaluator_train = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) if rank == 0: # Setup TensorBoard logging on trainer and evaluators. Logged values are: # - Training metrics, e.g. 
running average loss values # - Learning rate # - Evaluation train/test metrics evaluators = {"training": evaluator_train, "validation": evaluator} tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators) example_prediction_logger = ExamplePredictionLogger( tb_logger, model, device) def run_validation(engine): epoch = trainer.state.epoch state = evaluator_train.run(dataloader_train) data_subset = "Train" log_metrics(logger, epoch, state.times["COMPLETED"], data_subset, state.metrics) log_confusion_matrix(tb_logger, epoch, data_subset, state.metrics) state = evaluator.run(dataloader_val) data_subset = "Val" log_metrics(logger, epoch, state.times["COMPLETED"], data_subset, state.metrics) log_confusion_matrix(tb_logger, epoch, data_subset, state.metrics) example_prediction_logger.log_visualization(dataloader_val.dataset, epoch) trainer.add_event_handler( Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation) # Store 3 best models by validation accuracy: common.gen_save_best_models_by_val_score( save_handler=get_save_handler(config), evaluator=evaluator, models={"model": model}, metric_name="accuracy", n_saved=3, trainer=trainer, tag="validation", ) # TODO: Add early stopping # In order to check training resuming we can stop training on a given iteration if config["stop_iteration"] is not None: @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"])) def _(): logger.info( f"Stop training on {trainer.state.iteration} iteration") trainer.terminate() # noinspection PyBroadException try: trainer.run(dataloader_train, max_epochs=config["num_epochs"]) except Exception: import traceback print(traceback.format_exc()) if rank == 0: # noinspection PyUnboundLocalVariable tb_logger.close()
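# One possible way to address the "Add early stopping" TODO above, sketched with
# ignite.handlers.EarlyStopping. The patience value and the metric key ("accuracy") are
# assumptions here; in practice they would come from `config` and `get_metrics`.
from ignite.engine import Events
from ignite.handlers import EarlyStopping


def attach_early_stopping(trainer, evaluator, patience=10):
    def score_function(engine):
        # higher is better for accuracy
        return engine.state.metrics["accuracy"]

    es_handler = EarlyStopping(patience=patience, score_function=score_function, trainer=trainer)
    # evaluate the score each time validation completes
    evaluator.add_event_handler(Events.COMPLETED, es_handler)
    return es_handler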
def run(output_path, config):
    distributed = dist.is_available() and dist.is_initialized()
    rank = dist.get_rank() if distributed else 0

    manual_seed(config["seed"] + rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = utils.get_dataflow(config, distributed)
    model, optimizer = utils.get_model_optimizer(config, distributed)
    criterion = nn.CrossEntropyLoss().to(utils.device)

    le = len(train_loader)
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)

    # Setup Ignite trainer:
    # - let's define the training step
    # - add other common handlers:
    #    - TerminateOnNan,
    #    - handler to setup learning rate scheduling,
    #    - ModelCheckpoint
    #    - `RunningAverage` on `train_step` output
    #    - two progress bars on epochs and optionally on iterations

    def train_step(engine, batch):
        x = convert_tensor(batch[0], device=utils.device, non_blocking=True)
        y = convert_tensor(batch[1], device=utils.device, non_blocking=True)

        model.train()
        # Supervised part
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return {
            "batch loss": loss.item(),
        }

    if config["deterministic"] and rank == 0:
        print("Setup deterministic trainer")

    trainer = Engine(train_step) if not config["deterministic"] else DeterministicEngine(train_step)
    train_sampler = train_loader.sampler if distributed else None
    to_save = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler}
    metric_names = [
        "batch loss",
    ]

    common.setup_common_training_handlers(
        trainer,
        train_sampler=train_sampler,
        to_save=to_save,
        save_every_iters=config["checkpoint_every"],
        output_path=output_path,
        lr_scheduler=lr_scheduler,
        output_names=metric_names,
        with_pbar_on_iters=config["display_iters"],
        log_every_iters=10,
    )

    if rank == 0:
        # Setup Tensorboard logger - wrapper on SummaryWriter
        tb_logger = TensorboardLogger(log_dir=output_path)
        # Attach logger to the trainer and log trainer's metrics (stored in trainer.state.metrics) every iteration
        tb_logger.attach(
            trainer,
            log_handler=OutputHandler(tag="train", metric_names=metric_names),
            event_name=Events.ITERATION_COMPLETED,
        )
        # Log optimizer's parameters: "lr" every iteration
        tb_logger.attach(
            trainer, log_handler=OptimizerParamsHandler(optimizer, param_name="lr"), event_name=Events.ITERATION_STARTED
        )

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(device=utils.device if distributed else None),
        "loss": Loss(criterion, device=utils.device if distributed else None),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=utils.device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=utils.device, non_blocking=True)

    def run_validation(engine):
        train_evaluator.run(train_loader)
        evaluator.run(test_loader)

    trainer.add_event_handler(Events.EPOCH_STARTED(every=config["validate_every"]), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup progress bars on evaluation engines
        if config["display_iters"]:
            ProgressBar(persist=False, desc="Train evaluation").attach(train_evaluator)
            ProgressBar(persist=False, desc="Test evaluation").attach(evaluator)

        # Let's log metrics of `train_evaluator` stored in `train_evaluator.state.metrics` when the validation run is done
        tb_logger.attach(
            train_evaluator,
            log_handler=OutputHandler(
                tag="train", metric_names="all", global_step_transform=global_step_from_engine(trainer)
            ),
            event_name=Events.COMPLETED,
        )

        # Let's log metrics of `evaluator` stored in `evaluator.state.metrics` when the validation run is done
        tb_logger.attach(
            evaluator,
            log_handler=OutputHandler(
                tag="test", metric_names="all", global_step_transform=global_step_from_engine(trainer)
            ),
            event_name=Events.COMPLETED,
        )

        # Store 3 best models by validation accuracy:
        common.save_best_model_by_val_score(
            output_path, evaluator, model=model, metric_name="accuracy", n_saved=3, trainer=trainer, tag="test"
        )

        # Optionally log model gradients
        if config["log_model_grads_every"] is not None:
            tb_logger.attach(
                trainer,
                log_handler=GradsHistHandler(model, tag=model.__class__.__name__),
                event_name=Events.ITERATION_COMPLETED(every=config["log_model_grads_every"]),
            )

    # In order to check training resuming we can emulate a crash
    if config["crash_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["crash_iteration"]))
        def _(engine):
            raise Exception("STOP at iteration: {}".format(engine.state.iteration))

    resume_from = config["resume_from"]
    if resume_from is not None:
        checkpoint_fp = Path(resume_from)
        assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(checkpoint_fp.as_posix())
        print("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix()))
        checkpoint = torch.load(checkpoint_fp.as_posix())
        Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
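# A small standalone sketch of the PiecewiseLinear schedule built above: linear warmup from 0 to
# the base learning rate over the warmup epochs, then linear decay back to 0 by the end of
# training. The toy numbers are placeholders, and `simulate_values` is used here only to inspect
# the resulting curve without an optimizer or engine.
from ignite.handlers import PiecewiseLinear  # ignite.contrib.handlers in older versions

le, warmup_epochs, num_epochs, base_lr = 100, 2, 10, 0.1  # placeholders
values = PiecewiseLinear.simulate_values(
    num_events=le * num_epochs,
    param_name="lr",
    milestones_values=[(0, 0.0), (le * warmup_epochs, base_lr), (le * num_epochs, 0.0)],
)
# `values` is a list of [event_index, lr] pairs; the lr peaks at iteration le * warmup_epochs
print(values[0], values[le * warmup_epochs], values[-1])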
def training(local_rank, config):
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-QAT-Training", distributed_rank=local_rank)
    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")
        folder_name = "{}_backend-{}-{}_{}".format(config["model"], idist.backend(), idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)

    # Store the best model by validation accuracy:
    common.save_best_model_by_val_score(
        output_path=config["output_path"],
        evaluator=evaluator,
        model=model,
        metric_name="Accuracy",
        n_saved=1,
        trainer=trainer,
        tag="test",
    )

    trainer.run(train_loader, max_epochs=config["num_epochs"])

    if rank == 0:
        tb_logger.close()
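# For reference, `common.save_best_model_by_val_score` used above is roughly equivalent to wiring
# a Checkpoint with a DiskSaver by hand, as sketched below. This is a simplification under that
# assumption, not the actual helper implementation.
from ignite.engine import Events
from ignite.handlers import Checkpoint, DiskSaver, global_step_from_engine


def save_best_by_accuracy(output_path, evaluator, model, trainer, n_saved=1):
    handler = Checkpoint(
        {"model": model},
        DiskSaver(output_path, require_empty=False),
        filename_prefix="best",
        n_saved=n_saved,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_Accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    # save whenever a validation run finishes with a new best score
    evaluator.add_event_handler(Events.COMPLETED, handler)
    return handler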
def _test_distrib_integration(device, atol=1e-8):
    rank = idist.get_rank()
    n_iters = 100
    s = 10
    offset = n_iters * s

    # test for float
    manual_seed(42)
    y_pred = torch.rand(offset * idist.get_world_size(), 3, 28, 28, device=device)
    y = y_pred * 0.65
    data_range = (y.max() - y.min()).cpu().item()
    _test(y_pred, y, data_range, "cpu", n_iters, s, offset, rank, atol=atol)

    # test for YCbCr
    manual_seed(42)
    y_pred = torch.randint(16, 236, (offset * idist.get_world_size(), 1, 12, 12), dtype=torch.uint8, device=device)
    cbcr_pred = torch.randint(16, 241, (offset * idist.get_world_size(), 2, 12, 12), dtype=torch.uint8, device=device)
    y = torch.randint(16, 236, (offset * idist.get_world_size(), 1, 12, 12), dtype=torch.uint8, device=device)
    cbcr = torch.randint(16, 241, (offset * idist.get_world_size(), 2, 12, 12), dtype=torch.uint8, device=device)

    y_pred, y = torch.cat((y_pred, cbcr_pred), dim=1), torch.cat((y, cbcr), dim=1)

    data_range = (y[:, 0, ...].max() - y[:, 0, ...].min()).cpu().item()
    _test(
        y_pred=y_pred,
        y=y,
        data_range=data_range,
        metric_device="cpu",
        n_iters=n_iters,
        s=s,
        offset=offset,
        rank=rank,
        atol=atol,
        output_transform=lambda x: (x[0][:, 0, ...], x[1][:, 0, ...]),
        compute_y_channel=True,
    )

    # test for uint8
    manual_seed(42)
    y_pred = torch.randint(0, 256, (offset * idist.get_world_size(), 3, 16, 16), device=device, dtype=torch.uint8)
    y = (y_pred * 0.65).to(torch.uint8)
    data_range = (y.max() - y.min()).cpu().item()
    _test(y_pred, y, data_range, "cpu", n_iters, s, offset, rank, atol=atol)

    # test with NHW shape
    manual_seed(42)
    y_pred = torch.rand(offset * idist.get_world_size(), 28, 28, device=device)
    y = y_pred * 0.8
    data_range = (y.max() - y.min()).cpu().item()
    _test(y_pred, y, data_range, "cpu", n_iters, s, offset, rank, atol=atol)

    # repeat the same tests with the metric stored on the distributed device (not supported on XLA)
    if torch.device(device).type != "xla":
        # test for float
        manual_seed(42)
        y_pred = torch.rand(offset * idist.get_world_size(), 3, 28, 28, device=device)
        y = y_pred * 0.65
        data_range = (y.max() - y.min()).cpu().item()
        _test(y_pred, y, data_range, idist.device(), n_iters, s, offset, rank, atol=atol)

        # test for YCbCr
        manual_seed(42)
        y_pred = torch.randint(16, 236, (offset * idist.get_world_size(), 1, 12, 12), dtype=torch.uint8, device=device)
        cbcr_pred = torch.randint(16, 241, (offset * idist.get_world_size(), 2, 12, 12), dtype=torch.uint8, device=device)
        y = torch.randint(16, 236, (offset * idist.get_world_size(), 1, 12, 12), dtype=torch.uint8, device=device)
        cbcr = torch.randint(16, 241, (offset * idist.get_world_size(), 2, 12, 12), dtype=torch.uint8, device=device)

        y_pred, y = torch.cat((y_pred, cbcr_pred), dim=1), torch.cat((y, cbcr), dim=1)

        data_range = (y[:, 0, ...].max() - y[:, 0, ...].min()).cpu().item()
        _test(
            y_pred=y_pred,
            y=y,
            data_range=data_range,
            metric_device=idist.device(),
            n_iters=n_iters,
            s=s,
            offset=offset,
            rank=rank,
            atol=atol,
            output_transform=lambda x: (x[0][:, 0, ...], x[1][:, 0, ...]),
            compute_y_channel=True,
        )

        # test for uint8
        manual_seed(42)
        y_pred = torch.randint(0, 256, (offset * idist.get_world_size(), 3, 16, 16), device=device, dtype=torch.uint8)
        y = (y_pred * 0.65).to(torch.uint8)
        data_range = (y.max() - y.min()).cpu().item()
        _test(y_pred, y, data_range, idist.device(), n_iters, s, offset, rank, atol=atol)

        # test with NHW shape
        manual_seed(42)
        y_pred = torch.rand(offset * idist.get_world_size(), 28, 28, device=device)
        y = y_pred * 0.8
        data_range = (y.max() - y.min()).cpu().item()
        _test(y_pred, y, data_range, idist.device(), n_iters, s, offset, rank, atol=atol)
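# `_test` is defined elsewhere in the test module. A condensed sketch of what such a helper
# presumably does: attach a PSNR metric to an Engine, feed each rank its own contiguous shard of
# the seeded global tensors, and read the gathered result. The reference comparison and the
# compute_y_channel / output_transform handling are omitted here, so treat this as an assumption.
from ignite.engine import Engine
from ignite.metrics import PSNR


def _test_sketch(y_pred, y, data_range, metric_device, n_iters, s, offset, rank):
    def update(engine, i):
        # each rank consumes its own slice of the global tensors
        return (
            y_pred[i * s + rank * offset : (i + 1) * s + rank * offset],
            y[i * s + rank * offset : (i + 1) * s + rank * offset],
        )

    engine = Engine(update)
    PSNR(data_range=data_range, device=metric_device).attach(engine, "psnr")
    state = engine.run(range(n_iters), max_epochs=1)
    return state.metrics["psnr"]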
def _train(save_iter=None, save_epoch=None, sd=None):
    w_norms = []
    grad_norms = []
    data = []
    chkpt = []

    manual_seed(12)
    arch = [
        nn.Conv2d(3, 10, 3),
        nn.ReLU(),
        nn.Conv2d(10, 10, 3),
        nn.ReLU(),
        nn.AdaptiveAvgPool2d(1),
        nn.Flatten(),
        nn.Linear(10, 5),
        nn.ReLU(),
        nn.Linear(5, 2),
    ]
    if with_dropout:
        arch.insert(2, nn.Dropout2d())
        arch.insert(-2, nn.Dropout())

    model = nn.Sequential(*arch).to(device)
    opt = SGD(model.parameters(), lr=0.001)

    def proc_fn(e, b):
        from ignite.engine.deterministic import _get_rng_states, _repr_rng_state

        s = _repr_rng_state(_get_rng_states())
        model.train()
        opt.zero_grad()
        y = model(b.to(device))
        y.sum().backward()
        opt.step()
        if debug:
            print(trainer.state.iteration, trainer.state.epoch, "proc_fn - b.shape", b.shape, torch.norm(y).item(), s)

    trainer = DeterministicEngine(proc_fn)

    if save_iter is not None:
        ev = Events.ITERATION_COMPLETED(once=save_iter)
    elif save_epoch is not None:
        ev = Events.EPOCH_COMPLETED(once=save_epoch)
        save_iter = save_epoch * (data_size // batch_size)

    @trainer.on(ev)
    def save_chkpt(_):
        if debug:
            print(trainer.state.iteration, "save_chkpt")
        fp = dirname / "test.pt"
        from ignite.engine.deterministic import _repr_rng_state

        tsd = trainer.state_dict()
        if debug:
            print("->", _repr_rng_state(tsd["rng_states"]))
        torch.save([model.state_dict(), opt.state_dict(), tsd], fp)
        chkpt.append(fp)

    def log_event_filter(_, event):
        if (event // save_iter == 1) and 1 <= (event % save_iter) <= 5:
            return True
        return False

    @trainer.on(Events.ITERATION_COMPLETED(event_filter=log_event_filter))
    def write_data_grads_weights(e):
        x = e.state.batch
        i = e.state.iteration
        data.append([i, x.mean().item(), x.std().item()])

        total = [0.0, 0.0]
        out1 = []
        out2 = []
        for p in model.parameters():
            n1 = torch.norm(p).item()
            n2 = torch.norm(p.grad).item()
            out1.append(n1)
            out2.append(n2)
            total[0] += n1
            total[1] += n2

        w_norms.append([i, total[0]] + out1)
        grad_norms.append([i, total[1]] + out2)

    if sd is not None:
        sd = torch.load(sd)
        model.load_state_dict(sd[0])
        opt.load_state_dict(sd[1])
        from ignite.engine.deterministic import _repr_rng_state

        if debug:
            print("-->", _repr_rng_state(sd[2]["rng_states"]))
        trainer.load_state_dict(sd[2])

    manual_seed(32)
    trainer.run(random_train_data_loader(size=data_size), max_epochs=5)
    return {"sd": chkpt, "data": data, "grads": grad_norms, "weights": w_norms}
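# `random_train_data_loader`, `data_size`, `batch_size`, `device`, `debug` and `dirname` come from
# the enclosing test. A plausible minimal definition of the loader, for context only (an
# assumption, not the actual fixture): it must yield plain image tensors so that `proc_fn` can
# call `b.to(device)` directly on each batch.
import torch
from torch.utils.data import DataLoader, Dataset


class _RandomImages(Dataset):
    def __init__(self, size):
        self.data = torch.rand(size, 3, 24, 24)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]


def random_train_data_loader(size, batch_size=4):
    return DataLoader(_RandomImages(size), batch_size=batch_size, shuffle=True, num_workers=0)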
def run(local_rank, config):
    # ----------------------
    # Make a certain seed
    # ----------------------
    rank = idist.get_rank()
    manual_seed(config.seed + rank)
    device = idist.device()

    # -----------------------
    # Create output folder
    # -----------------------
    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")
        name = f"{config.model}-backend-{idist.backend()}-{now}"
        path = Path(config.output_dir, name)
        path.mkdir(parents=True, exist_ok=True)
        config.output_dir = path.as_posix()

    config.output_dir = Path(idist.broadcast(config.output_dir, src=0))

    # -----------------------------
    # datasets and dataloaders
    # -----------------------------
    train_loader, test_loader = get_dataflow(config)

    # ------------------------------------------
    # model, optimizer, loss function, device
    # ------------------------------------------
    config.num_iters_per_epoch = len(train_loader)
    model, optimizer, loss_fn, lr_scheduler = initialize(config)

    # -----------------------------
    # trainer and evaluator
    # -----------------------------
    trainer, evaluator = create_trainers(
        config=config,
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        device=device,
    )

    # ---------------------------------
    # attach metrics to evaluator
    # ---------------------------------
    metrics = {
        "eval_accuracy": Accuracy(output_transform=thresholded_output_transform, device=device),
        "eval_loss": Loss(loss_fn, device=device),
    }
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # -------------------------------------------
    # setup engines logger with python logging
    # print training configurations
    # -------------------------------------------
    logger = setup_logging(config)
    log_basic_info(logger, config)
    trainer.logger = logger
    evaluator.logger = logger

    # -------------------------------------
    # ignite handlers and ignite loggers
    # -------------------------------------
    to_save = {"model": model, "optimizer": optimizer, "trainer": trainer, "lr_scheduler": lr_scheduler}
    best_model_handler, es_handler, timer_handler = get_handlers(
        config=config,
        model=model,
        trainer=trainer,
        evaluator=evaluator,
        metric_name="eval_accuracy",
        es_metric_name="eval_accuracy",
        to_save=to_save,
        lr_scheduler=lr_scheduler,
        output_names=None,
    )

    # setup ignite logger only on rank 0
    if rank == 0:
        logger_handler = get_logger(config=config, trainer=trainer, evaluator=evaluator, optimizers=optimizer)

    # -----------------------------------
    # resume from the saved checkpoints
    # -----------------------------------
    if config.resume_from:
        resume_from(to_load=to_save, checkpoint_fp=config.resume_from)

    # --------------------------------
    # print metrics to the stderr
    # with `add_event_handler` API
    # for training stats
    # --------------------------------
    trainer.add_event_handler(Events.ITERATION_COMPLETED(every=config.log_every_iters), log_metrics, tag="train")

    # ---------------------------------------------
    # run evaluation at every training epoch end
    # with shortcut `on` decorator API and
    # print metrics to the stderr
    # again with `add_event_handler` API
    # for evaluation stats
    # ---------------------------------------------
    @trainer.on(Events.EPOCH_COMPLETED(every=config.validate_every))
    def _():
        evaluator.run(test_loader, epoch_length=config.eval_epoch_length)
        log_metrics(evaluator, tag="eval")

    # --------------------------------------------------
    # let's try run evaluation first as a sanity check
    # --------------------------------------------------
    @trainer.on(Events.STARTED)
    def _():
        evaluator.run(test_loader, epoch_length=config.eval_epoch_length)

    # ------------------------------------------
    # setup is done. let's run the training
    # ------------------------------------------
    trainer.run(
        train_loader,
        max_epochs=config.max_epochs,
        epoch_length=config.train_epoch_length,
    )

    # ------------------------------------------------------------
    # close the logger after the training completed / terminated
    # ------------------------------------------------------------
    if rank == 0:
        from ignite.contrib.handlers.wandb_logger import WandBLogger

        if isinstance(logger_handler, WandBLogger):
            # why handle wandb differently?
            # See: https://github.com/pytorch/ignite/issues/1894
            logger_handler.finish()
        elif logger_handler:
            logger_handler.close()

    # -----------------------------------------
    # where is my best and last checkpoint ?
    # -----------------------------------------
    if best_model_handler is not None:
        logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
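# `thresholded_output_transform` above binarizes the model outputs before Accuracy is computed.
# A typical definition for a sigmoid-output binary classifier is sketched below; this is an
# assumption for illustration, the actual helper lives elsewhere in this template.
import torch


def thresholded_output_transform(output):
    y_pred, y = output
    y_pred = torch.round(torch.sigmoid(y_pred))  # logits -> probabilities -> {0, 1}
    return y_pred, y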