def training(local_rank, config): rank = idist.get_rank() manual_seed(config["seed"] + rank) device = idist.device() logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank) log_basic_info(logger, config) output_path = config["output_path"] if rank == 0: if config["stop_iteration"] is None: now = datetime.now().strftime("%Y%m%d-%H%M%S") else: now = f"stop-on-{config['stop_iteration']}" folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}" output_path = Path(output_path) / folder_name if not output_path.exists(): output_path.mkdir(parents=True) config["output_path"] = output_path.as_posix() logger.info(f"Output path: {config['output_path']}") if "cuda" in device.type: config["cuda device name"] = torch.cuda.get_device_name(local_rank) if config["with_trains"]: from trains import Task task = Task.init("CIFAR10-Training", task_name=output_path.stem) task.connect_configuration(config) # Log hyper parameters hyper_params = [ "model", "batch_size", "momentum", "weight_decay", "num_epochs", "learning_rate", "num_warmup_epochs", ] task.connect({k: config[k] for k in hyper_params}) # Setup dataflow, model, optimizer, criterion train_loader, test_loader = get_dataflow(config) config["num_iters_per_epoch"] = len(train_loader) model, optimizer, criterion, lr_scheduler = initialize(config) # Create trainer for current task trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger) # Let's now setup evaluator engine to perform model's validation and compute metrics metrics = { "accuracy": Accuracy(), "loss": Loss(criterion), } # We define two evaluators as they wont have exactly similar roles: # - `evaluator` will save the best model based on validation score evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) def run_validation(engine): epoch = trainer.state.epoch state = train_evaluator.run(train_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics) state = evaluator.run(test_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics) trainer.add_event_handler( Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation) if rank == 0: # Setup TensorBoard logging on trainer and evaluators. Logged values are: # - Training metrics, e.g. running average loss values # - Learning rate # - Evaluation train/test metrics evaluators = {"training": train_evaluator, "test": evaluator} tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators) # Store 3 best models by validation accuracy: common.gen_save_best_models_by_val_score( save_handler=get_save_handler(config), evaluator=evaluator, models={"model": model}, metric_name="accuracy", n_saved=3, trainer=trainer, tag="test", ) # In order to check training resuming we can stop training on a given iteration if config["stop_iteration"] is not None: @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"])) def _(): logger.info( f"Stop training on {trainer.state.iteration} iteration") trainer.terminate() try: trainer.run(train_loader, max_epochs=config["num_epochs"]) except Exception as e: import traceback print(traceback.format_exc()) if rank == 0: tb_logger.close()
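# `get_save_handler(config)` is passed to `gen_save_best_models_by_val_score` above but is not
# defined in this snippet. A minimal sketch, assuming it merely switches between a local
# DiskSaver and a TrainsSaver when the Trains logger is enabled (config keys follow the snippet
# above; this is an illustration, not the project's actual helper):
from ignite.handlers import DiskSaver


def get_save_handler(config):
    if config["with_trains"]:
        from ignite.contrib.handlers.trains_logger import TrainsSaver

        # Checkpoints are uploaded to the Trains server alongside the experiment.
        return TrainsSaver(dirname=config["output_path"])
    # Plain local checkpointing otherwise.
    return DiskSaver(config["output_path"], require_empty=False)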
def create_trainer(loader, model, opt, loss_fn, device, args):

    def _update(engine, batch):
        model.train()
        x = batch['x'].to(engine.state.device, non_blocking=True)
        y = batch['y'].to(engine.state.device, non_blocking=True)
        m = batch['m'].to(engine.state.device, non_blocking=True)
        opt.zero_grad()
        y_pred = model(x)
        softmax = nn.Softmax(dim=1)  # softmax over the class dimension
        masked_loss = softmax(y_pred)
        # masked_loss = y_pred * m
        loss = loss_fn(masked_loss, y)
        if m.sum().item() / m.numel() > 0.7:
            loss.backward()
            opt.step()
        masked_loss = (masked_loss > 0.5).float()
        acc = accuracy_segmentation(masked_loss[:, 1, :, :, :], y[:, 1, :, :, :])
        return {
            'x': x.detach(), 'y': y.detach(), 'm': m.detach(),
            'y_pred': y_pred.detach(), 'loss': loss.item(), 'acc': acc,
        }

    def _inference(engine, batch):
        model.eval()
        with th.no_grad():
            x = batch['x'].to(engine.state.device, non_blocking=True)
            y = batch['y'].to(engine.state.device, non_blocking=True)
            m = batch['m'].to(engine.state.device, non_blocking=True)
            y_pred = model(x)
            softmax = nn.Softmax(dim=1)
            masked_loss = softmax(y_pred)
            # masked_loss = y_pred * m
            loss = loss_fn(masked_loss, y)
            masked_loss = (masked_loss[-3:] > 0.5).float()
            acc = accuracy_segmentation(masked_loss[:, 1, :, :, :], y[:, 1, :, :, :])
        return {
            'x': x.detach(), 'y': y.detach(), 'm': m.detach(),
            'y_pred': y_pred.detach(), 'loss': loss.item(), 'acc': acc,
        }

    # wandb.watch(model, log='all')
    trainer = Engine(_update)
    evaluator = Engine(_inference)

    profiler = BasicTimeProfiler()
    profiler.attach(trainer)

    logdir = args.logdir
    save_ = (not args.devrun) and (not args.nosave)

    # initialize trainer state
    trainer.state.device = device
    trainer.state.hparams = args
    trainer.state.save = save_
    trainer.state.logdir = logdir
    trainer.state.df = defaultdict(dict)
    trainer.state.metrics = dict()
    trainer.state.val_metrics = dict()
    trainer.state.best_metrics = defaultdict(list)
    trainer.state.gradnorm = defaultdict(dict)

    # initialize evaluator state
    evaluator.logger = setup_logger('evaluator')
    evaluator.state.device = device
    evaluator.state.df = defaultdict(dict)
    evaluator.state.metrics = dict()

    pbar = ProgressBar(persist=True)
    ebar = ProgressBar(persist=False)
    pbar.attach(trainer, ['loss', 'acc'])
    ebar.attach(evaluator, ['loss', 'acc'])

    # model summary
    if args.model_summary:
        trainer.add_event_handler(Events.STARTED, print_model_summary, model)

    # terminate on nan
    trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan(lambda x: x['loss']))

    # metrics
    trainer.add_event_handler(Events.ITERATION_COMPLETED, _metrics)
    evaluator.add_event_handler(Events.ITERATION_COMPLETED, _metrics)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, _metrics_mean)
    evaluator.add_event_handler(Events.COMPLETED, _metrics_mean)
    trainer.add_event_handler(
        # Events.STARTED | Events.EPOCH_COMPLETED,
        Events.EPOCH_COMPLETED,
        _evaluate, evaluator, loader
    )

    # logging
    trainer.add_event_handler(Events.EPOCH_COMPLETED, _log_metrics)

    # early stopping
    if args.early_stopping > 0:
        es_p = args.early_stopping
        es_s = lambda engine: -engine.state.metrics['loss']
        evaluator.add_event_handler(
            Events.COMPLETED,
            EarlyStopping(patience=es_p, score_function=es_s, trainer=trainer)
        )

    # lr schedulers
    if args.epoch_length is None:
        el = len(loader['train'])
    else:
        el = args.epoch_length

    if args.lr_scheduler is not None:
        lr_sched = create_lr_scheduler(opt, args, num_steps=el)

        if args.lr_scheduler != 'plateau':
            def _sched_fun(engine):
                lr_sched.step()
        else:
            def _sched_fun(engine):
                e = engine.state.epoch
                v = engine.state.val_metrics[e]['nmse']
                lr_sched.step(v)

        if args.lr_scheduler == 'linearcycle':
            trainer.add_event_handler(Events.ITERATION_STARTED, lr_sched)
        else:
            trainer.add_event_handler(Events.EPOCH_COMPLETED, _sched_fun)

    # FIXME: warmup is modifying opt base_lr -> must create last
    if args.lr_warmup > 0:
        wsched = create_lr_scheduler(opt, args, 'warmup', num_steps=el)
        wsts = wsched.total_steps
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED(event_filter=lambda _, i: i <= wsts),
            lambda _: wsched.step()
        )

    # saving
    if save_:
        to_save = {'model': model, 'optimizer': opt, 'trainer': trainer, 'evaluator': evaluator}
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED,
            Checkpoint(to_save, DiskSaver(logdir), n_saved=3)
        )

        # handler = Checkpoint(
        #     {'model': model},
        #     DiskSaver(logdir),
        #     n_saved=3,
        #     filename_prefix='best',
        #     score_function=lambda engine: -engine.state.metrics['nmae'],
        #     score_name='val_nmae',
        # )
        # evaluator.add_event_handler(Events.COMPLETED, handler)

        # handler = Checkpoint(
        #     {'model': model},
        #     DiskSaver(logdir),
        #     n_saved=3,
        #     filename_prefix='best',
        #     score_function=lambda engine: -engine.state.metrics['nmse'],
        #     score_name='val_nmse',
        # )
        # evaluator.add_event_handler(Events.COMPLETED, handler)

        # handler = Checkpoint(
        #     {'model': model},
        #     DiskSaver(logdir),
        #     n_saved=3,
        #     filename_prefix='best',
        #     score_function=lambda engine: engine.state.metrics['R2'],
        #     score_name='val_R2',
        # )
        # evaluator.add_event_handler(Events.COMPLETED, handler)

        trainer.add_event_handler(Events.EPOCH_COMPLETED, _save_metrics)

    # timer
    trainer.add_event_handler(
        Events.COMPLETED | Events.TERMINATE,
        lambda _: profiler.write_results(logdir + '/time.csv')
    )

    return trainer
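# `accuracy_segmentation` is used by `_update`/`_inference` above but is not defined here.
# A hypothetical minimal version, assuming it compares a thresholded prediction volume against
# a binary target volume voxel-wise (the real helper may differ):
import torch


def accuracy_segmentation(pred_mask, target_mask):
    # Fraction of voxels where the binarized prediction agrees with the target.
    pred_mask = (pred_mask > 0.5).float()
    target_mask = (target_mask > 0.5).float()
    return (pred_mask == target_mask).float().mean().item()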
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval): train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size) model = Net() device = "cpu" if torch.cuda.is_available(): device = "cuda" model.to(device) # Move model before creating optimizer optimizer = SGD(model.parameters(), lr=lr, momentum=momentum) criterion = nn.NLLLoss() trainer = create_supervised_trainer(model, optimizer, criterion, device=device) trainer.logger = setup_logger("trainer") val_metrics = {"accuracy": Accuracy(), "nll": Loss(criterion)} evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=device) evaluator.logger = setup_logger("evaluator") pbar = tqdm( initial=0, leave=False, total=len(train_loader), desc=f"ITERATION - loss: {0:.2f}", ) @trainer.on(Events.ITERATION_COMPLETED(every=log_interval)) def log_training_loss(engine): pbar.desc = f"ITERATION - loss: {engine.state.output:.2f}" pbar.update(log_interval) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): pbar.refresh() evaluator.run(train_loader) metrics = evaluator.state.metrics avg_accuracy = metrics["accuracy"] avg_nll = metrics["nll"] tqdm.write( f"Training Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}" ) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) metrics = evaluator.state.metrics avg_accuracy = metrics["accuracy"] avg_nll = metrics["nll"] tqdm.write( f"Validation Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}" ) pbar.n = pbar.last_print_n = 0 @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED) def log_time(engine): tqdm.write( f"{trainer.last_event_name.name} took { trainer.state.times[trainer.last_event_name.name]} seconds" ) trainer.run(train_loader, max_epochs=epochs) pbar.close()
def run(train_batch_size, val_batch_size, epochs, lr, momentum): train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size) model = Net() device = "cpu" if torch.cuda.is_available(): device = "cuda" model.to(device) # Move model before creating optimizer optimizer = SGD(model.parameters(), lr=lr, momentum=momentum) criterion = nn.CrossEntropyLoss() trainer = create_supervised_trainer(model, optimizer, criterion, device=device) trainer.logger = setup_logger("Trainer") metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)} train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device) train_evaluator.logger = setup_logger("Train Evaluator") validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device) validation_evaluator.logger = setup_logger("Val Evaluator") @trainer.on(Events.EPOCH_COMPLETED) def compute_metrics(engine): train_evaluator.run(train_loader) validation_evaluator.run(val_loader) clearml_logger = ClearMLLogger(project_name="examples", task_name="ignite") clearml_logger.attach_output_handler( trainer, event_name=Events.ITERATION_COMPLETED(every=100), tag="training", output_transform=lambda loss: {"batchloss": loss}, ) for tag, evaluator in [("training metrics", train_evaluator), ("validation metrics", validation_evaluator)]: clearml_logger.attach_output_handler( evaluator, event_name=Events.EPOCH_COMPLETED, tag=tag, metric_names=["loss", "accuracy"], global_step_transform=global_step_from_engine(trainer), ) clearml_logger.attach_opt_params_handler( trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer) clearml_logger.attach( trainer, log_handler=WeightsScalarHandler(model, whitelist=["fc1"]), event_name=Events.ITERATION_COMPLETED(every=100), ) def is_conv(n, _): return "conv" in n clearml_logger.attach( trainer, log_handler=WeightsHistHandler(model, whitelist=is_conv), event_name=Events.ITERATION_COMPLETED(every=100), ) clearml_logger.attach(trainer, log_handler=GradsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100)) clearml_logger.attach( trainer, log_handler=GradsHistHandler(model, whitelist=["fc2.weight"]), event_name=Events.ITERATION_COMPLETED(every=100), ) handler = Checkpoint( {"model": model}, ClearMLSaver(), n_saved=1, score_function=lambda e: e.state.metrics["accuracy"], score_name="val_acc", filename_prefix="best", global_step_transform=global_step_from_engine(trainer), ) validation_evaluator.add_event_handler(Events.EPOCH_COMPLETED, handler) # kick everything off trainer.run(train_loader, max_epochs=epochs) clearml_logger.close()
def run(): writer = SummaryWriter() CUDA = Config.device model = Retriever() print(f'Initializing model on {CUDA}') model.to(CUDA) optimizer = torch.optim.Adam(model.parameters(), lr=Config.LR) loss_fn = torch.nn.L1Loss().to(CUDA) print(f'Creating sentence transformer') encoder = SentenceTransformer(Config.sentence_transformer).to(CUDA) for parameter in encoder.parameters(): parameter.requires_grad = False print(f'Loading data') if os.path.exists('_full_dump'): with open('_full_dump', 'rb') as pin: train_loader, train_utts, val_loader, val_utts = pickle.load(pin) else: data = load_data(Config.data_source) train_loader, train_utts, val_loader, val_utts = get_loaders(data, encoder, Config.batch_size) with open('_full_dump', 'wb') as pout: pickle.dump((train_loader, train_utts, val_loader, val_utts), pout, protocol=-1) def train_step(engine, batch): model.train() optimizer.zero_grad() x, not_ys, y = batch yhat = model(x[0]) loss = loss_fn(yhat, y) gains = loss_fn(not_ys[0], yhat) * Config.negative_weight loss -= gains loss.backward() optimizer.step() return loss.item() def eval_step(engine, batch): model.eval() with torch.no_grad(): x, _, y = batch yhat = model(x[0]) return yhat, y trainer = Engine(train_step) trainer.logger = setup_logger('trainer') evaluator = Engine(eval_step) evaluator.logger = setup_logger('evaluator') latent_space = BallTree(numpy.array(list(train_utts.keys()))) l1 = Loss(loss_fn) recall = RecallAt(latent_space) recall.attach(evaluator, 'recall') l1.attach(evaluator, 'l1') @trainer.on(Events.ITERATION_COMPLETED(every=1000)) def log_training(engine): batch_loss = engine.state.output lr = optimizer.param_groups[0]['lr'] e = engine.state.epoch n = engine.state.max_epochs i = engine.state.iteration print("Epoch {}/{} : {} - batch loss: {}, lr: {}".format(e, n, i, batch_loss, lr)) writer.add_scalar('Training/loss', batch_loss, i) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): evaluator.run(val_loader) metrics = evaluator.state.metrics print(f"Training Results - Epoch: {engine.state.epoch} " f" L1: {metrics['l1']:.2f} " f" R@1: {metrics['r1']:.2f} " f" R@3: {metrics['r3']:.2f} " f" R@10: {metrics['r10']:.2f} ") for metric, value in metrics.items(): writer.add_scalar(f'Training/{metric}', value, engine.state.epoch) #@trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) metrics = evaluator.state.metrics print(f"Validation Results - Epoch: {engine.state.epoch} " f"L1: {metrics['l1']:.2f} " f" R@10: {metrics['r10']:.2f} ") for metric, value in metrics.items(): writer.add_scalar(f'Validation/{metric}', value, engine.state.epoch) trainer.run(train_loader, max_epochs=Config.max_epochs) torch.save(model.state_dict(), Config.checkpoint) print(f'Saved checkpoint at {Config.checkpoint}') interact(model, encoder, latent_space, train_utts)
def training(local_rank, config): rank = idist.get_rank() manual_seed(config["seed"] + rank) device = idist.device() logger = setup_logger(name="IMDB-Training", distributed_rank=local_rank) log_basic_info(logger, config) output_path = config["output_dir"] if rank == 0: now = datetime.now().strftime("%Y%m%d-%H%M%S") folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}" output_path = Path(output_path) / folder_name if not output_path.exists(): output_path.mkdir(parents=True) config["output_dir"] = output_path.as_posix() logger.info(f"Output path: {config['output_dir']}") if "cuda" in device.type: config["cuda device name"] = torch.cuda.get_device_name(local_rank) if config["with_clearml"]: try: from clearml import Task except ImportError: # Backwards-compatibility for legacy Trains SDK from trains import Task task = Task.init("IMDB-Training", task_name=output_path.stem) task.connect_configuration(config) # Log hyper parameters hyper_params = [ "model", "dropout", "n_fc", "batch_size", "max_length", "weight_decay", "num_epochs", "learning_rate", "num_warmup_epochs", ] task.connect({k: config[k] for k in hyper_params}) # Setup dataflow, model, optimizer, criterion train_loader, test_loader = get_dataflow(config) config["num_iters_per_epoch"] = len(train_loader) model, optimizer, criterion, lr_scheduler = initialize(config) # Create trainer for current task trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger) # Let's now setup evaluator engine to perform model's validation and compute metrics metrics = { "Accuracy": Accuracy(output_transform=utils.thresholded_output_transform), "Loss": Loss(criterion), } # We define two evaluators as they wont have exactly similar roles: # - `evaluator` will save the best model based on validation score evaluator = create_evaluator(model, metrics, config, tag="val") train_evaluator = create_evaluator(model, metrics, config, tag="train") def run_validation(engine): epoch = trainer.state.epoch state = train_evaluator.run(train_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics) state = evaluator.run(test_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics) trainer.add_event_handler( Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED | Events.STARTED, run_validation) if rank == 0: # Setup TensorBoard logging on trainer and evaluators. Logged values are: # - Training metrics, e.g. running average loss values # - Learning rate # - Evaluation train/test metrics evaluators = {"training": train_evaluator, "test": evaluator} tb_logger = common.setup_tb_logging( output_path, trainer, optimizer, evaluators=evaluators, log_every_iters=config["log_every_iters"]) # Store 2 best models by validation accuracy starting from num_epochs / 2: best_model_handler = Checkpoint( {"model": model}, utils.get_save_handler(config), filename_prefix="best", n_saved=2, global_step_transform=global_step_from_engine(trainer), score_name="test_accuracy", score_function=Checkpoint.get_default_score_fn("Accuracy"), ) evaluator.add_event_handler( Events.COMPLETED( lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler) try: trainer.run(train_loader, max_epochs=config["num_epochs"]) except Exception as e: logger.exception("") raise e if rank == 0: tb_logger.close()
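# `utils.thresholded_output_transform` above turns raw logits into hard 0/1 predictions before
# Accuracy is computed. A sketch of what such a transform typically looks like for this kind of
# binary classifier (an assumption, not copied from the project's utils module):
import torch


def thresholded_output_transform(output):
    y_pred, y = output
    # Accuracy needs discrete predictions: squash logits with a sigmoid and round.
    y_pred = torch.round(torch.sigmoid(y_pred))
    return y_pred, y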
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_dir): train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size) model = Net() device = "cpu" if torch.cuda.is_available(): device = "cuda" model.to(device) # Move model before creating optimizer optimizer = SGD(model.parameters(), lr=lr, momentum=momentum) criterion = nn.CrossEntropyLoss() trainer = create_supervised_trainer(model, optimizer, criterion, device=device) trainer.logger = setup_logger("Trainer") if sys.version_info > (3,): from ignite.contrib.metrics.gpu_info import GpuInfo try: GpuInfo().attach(trainer) except RuntimeError: print( "INFO: By default, in this example it is possible to log GPU information (used memory, utilization). " "As there is no pynvml python package installed, GPU information won't be logged. Otherwise, please " "install it : `pip install pynvml`" ) metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)} train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device) train_evaluator.logger = setup_logger("Train Evaluator") validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device) validation_evaluator.logger = setup_logger("Val Evaluator") @trainer.on(Events.EPOCH_COMPLETED) def compute_metrics(engine): train_evaluator.run(train_loader) validation_evaluator.run(val_loader) tb_logger = TensorboardLogger(log_dir=log_dir) tb_logger.attach_output_handler( trainer, event_name=Events.ITERATION_COMPLETED(every=100), tag="training", output_transform=lambda loss: {"batchloss": loss}, metric_names="all", ) for tag, evaluator in [("training", train_evaluator), ("validation", validation_evaluator)]: tb_logger.attach_output_handler( evaluator, event_name=Events.EPOCH_COMPLETED, tag=tag, metric_names=["loss", "accuracy"], global_step_transform=global_step_from_engine(trainer), ) tb_logger.attach_opt_params_handler(trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer) tb_logger.attach( trainer, log_handler=WeightsScalarHandler(model, whitelist=["fc1"]), event_name=Events.ITERATION_COMPLETED(every=100), ) def is_conv(n, _): return "conv" in n tb_logger.attach( trainer, log_handler=WeightsHistHandler(model, whitelist=is_conv), event_name=Events.ITERATION_COMPLETED(every=100), ) tb_logger.attach(trainer, log_handler=GradsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100)) tb_logger.attach( trainer, log_handler=GradsHistHandler(model, whitelist=["fc2.weight"]), event_name=Events.ITERATION_COMPLETED(every=100), ) def score_function(engine): return engine.state.metrics["accuracy"] model_checkpoint = ModelCheckpoint( log_dir, n_saved=2, filename_prefix="best", score_function=score_function, score_name="validation_accuracy", global_step_transform=global_step_from_engine(trainer), ) validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint, {"model": model}) # kick everything off trainer.run(train_loader, max_epochs=epochs) tb_logger.close()
def auto_dataloader(dataset, **kwargs): """Helper method to create a dataloader adapted for non-distributed and distributed configurations (supporting all available backends from :meth:`~ignite.distributed.utils.available_backends()`). Internally, we create a dataloader with provided kwargs while applying the following updates: - batch size is scaled by world size: ``batch_size / world_size``. - number of workers is scaled by number of local processes: ``num_workers / nprocs``. - if no sampler provided by user, `torch DistributedSampler` is setup. - if a sampler is provided by user, it is wrapped by :class:`~ignite.distributed.auto.DistributedProxySampler`. .. warning:: Custom batch sampler is not adapted for distributed configuration. Please, make sure that provided batch sampler is compatible with distributed configuration. Examples: .. code-block:: python import ignite.distribted as idist train_loader = idist.auto_dataloader( train_dataset, batch_size=32, num_workers=4, shuffle=True, pin_memory="cuda" in idist.device().type, drop_last=True, ) Args: dataset (Dataset): input torch dataset **kwargs: keyword arguments for `torch DataLoader`_. Returns: `torch DataLoader`_ or `XLA MpDeviceLoader`_ for XLA devices .. _torch DataLoader: https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader .. _XLA MpDeviceLoader: https://github.com/pytorch/xla/blob/master/torch_xla/distributed/parallel_loader.py#L178 .. _torch DistributedSampler: https://pytorch.org/docs/stable/data.html#torch.utils.data.distributed.DistributedSampler """ rank = idist.get_rank() world_size = idist.get_world_size() logger = setup_logger(__name__ + ".auto_dataloader") if world_size > 1: if "batch_size" in kwargs: kwargs["batch_size"] //= world_size if "num_workers" in kwargs: nproc = idist.get_nproc_per_node() kwargs["num_workers"] = (kwargs["num_workers"] + nproc - 1) // nproc if "batch_sampler" not in kwargs: if kwargs.get("sampler", None) is not None: sampler = DistributedProxySampler(kwargs["sampler"], num_replicas=world_size, rank=rank) else: sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=kwargs.get( "shuffle", True)) # we need to remove "shuffle" from kwargs if sampler is used if "shuffle" in kwargs: del kwargs["shuffle"] kwargs["sampler"] = sampler else: warnings.warn( "Found batch_sampler in provided kwargs. Please, make sure that it is compatible " "with distributed configuration") if idist.has_xla_support and idist.backend( ) == idist_xla.XLA_TPU and kwargs.get("pin_memory", False): # TODO: How about XLA GPU ? warnings.warn( "Found incompatible options: xla support and pin_memory args equal True. " "Argument `pin_memory=False` will be used to construct data loader." ) kwargs["pin_memory"] = False logger.info("Use data loader kwargs for dataset '{}': \n\t{}".format( repr(dataset)[:20].strip(), kwargs)) dataloader = DataLoader(dataset, **kwargs) if idist.has_xla_support and idist.backend( ) == idist_xla.XLA_TPU and world_size > 1: logger.info("DataLoader is wrapped by `MpDeviceLoader` on XLA") mp_device_loader_cls = _MpDeviceLoader try: from torch_xla.distributed.parallel_loader import MpDeviceLoader mp_device_loader_cls = MpDeviceLoader except ImportError: pass sampler = dataloader.sampler dataloader = mp_device_loader_cls(dataloader, idist.device()) dataloader.sampler = sampler return dataloader
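# When a custom sampler is passed in, auto_dataloader wraps it with DistributedProxySampler as
# described above. A small self-contained illustration (the toy dataset and weights are
# placeholders):
import torch
from torch.utils.data import TensorDataset, WeightedRandomSampler
import ignite.distributed as idist

toy_dataset = TensorDataset(torch.randn(100, 3), torch.randint(0, 2, (100,)))
toy_sampler = WeightedRandomSampler(weights=torch.ones(100), num_samples=100)

# Inside a distributed context, auto_dataloader effectively does:
#   sampler = DistributedProxySampler(toy_sampler, num_replicas=world_size, rank=rank)
# and also divides batch_size / num_workers by the world size, as documented above.
toy_loader = idist.auto_dataloader(toy_dataset, batch_size=32, sampler=toy_sampler, num_workers=2)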
def run_training(local_rank: int, config: ConfigSchema) -> Dict[str, float]: rank = idist.get_rank() if config.seed is not None: manual_seed(config.seed + rank) logger = setup_logger(name=config.experiment_name, distributed_rank=local_rank) log_basic_info(logger, config) if rank == 0: prepare_output_directory(config) logger.info("Output path: {}".format(config.output_path)) weak_label_mgr = get_weak_label_manager(config) # Setup dataflow, model, optimizer, criterion data_loaders = get_dataflow(config, weak_label_mgr) train_loader = data_loaders["train"] config.num_iters_per_epoch = len(train_loader) model, optimizer, criterion = initialize(config, weak_label_mgr) metrics = get_metrics(criterion) trainer, evaluators = create_trainer_and_evaluators( model, optimizer, criterion, data_loaders, metrics, config, logger ) if rank == 0: tb_logger = common.setup_tb_logging( config.output_path, trainer, optimizer, evaluators=evaluators ) # Store 3 best models by validation accuracy: common.gen_save_best_models_by_val_score( save_handler=get_save_handler(config), evaluator=evaluators["val"], models={"model": model}, metric_name="accuracy", n_saved=3, trainer=trainer, tag="test", ) state_at_best_val = StateAtBestVal( score_function=lambda: evaluators["val"].state.metrics["accuracy"], state_function=lambda: dict( {"val_" + key: val for key, val in evaluators["val"].state.metrics.items()}, **{ "test_" + key: val for key, val in evaluators["test"].state.metrics.items() }, epoch=trainer.state.epoch ), ) trainer.add_event_handler(Events.EPOCH_COMPLETED, state_at_best_val) try: trainer.run(train_loader, max_epochs=config.num_epochs) except Exception: import traceback print(traceback.format_exc()) else: assert state_at_best_val.best_state is not None tb_logger.writer.add_hparams( # type: ignore get_hparams(config), {"hparam/" + key: val for key, val in state_at_best_val.best_state.items()}, ) finally: if rank == 0: tb_logger.close() # type: ignore return evaluators["val"].state.metrics
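# `StateAtBestVal` is a project-specific handler. A plausible minimal implementation, assuming
# it simply remembers the metrics snapshot of the best-scoring epoch (field names follow the
# usage above; the real class may differ):
from typing import Callable, Dict, Optional


class StateAtBestVal:
    def __init__(self, score_function: Callable[[], float], state_function: Callable[[], Dict]) -> None:
        self.score_function = score_function
        self.state_function = state_function
        self.best_score: Optional[float] = None
        self.best_state: Optional[Dict] = None

    def __call__(self) -> None:
        # Attached to EPOCH_COMPLETED: keep the state of the best epoch seen so far.
        score = self.score_function()
        if self.best_score is None or score > self.best_score:
            self.best_score = score
            self.best_state = self.state_function()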
def run(mode, cfg): device = 'cuda' if cfg.SYSTEM.USE_CUDA else 'cpu' print(cfg.MODEL.NAME) model = ResNext() train_loader, val_loader, test_loader = get_data_loaders( cfg.TRAIN, cfg.EVALUATE, cfg.TEST, cfg.AUGMENT) if cfg.MODEL.CHECKPOINT: model.load_state_dict(torch.load(cfg.MODEL.CHECKPOINT)) print( f"Load {cfg.MODEL.NAME} weight ({cfg.MODEL.CHECKPOINT}) sucessfully!" ) loss = torch.nn.CrossEntropyLoss() pbar = ProgressBar() if mode == 'train': timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S") dir_name = f'{timestamp}_{cfg.MODEL.NAME}{cfg.TAG}' writer = create_summary_writer(model, train_loader, f"runs/{dir_name}") optimizer = RAdam(model.parameters(), lr=cfg.OPTIM.INIT_LR) scheduler = torch.optim.lr_scheduler.StepLR(optimizer, cfg.LR_SCHEDULER.STEP_SIZE) trainer = create_supervised_trainer(model, optimizer, loss, device) trainer.logger = setup_logger("trainer") pbar.attach(trainer) evaluator = create_supervised_evaluator(model, {"TOP_1": TOP_1()}, device) evaluator.logger = setup_logger("evaluator") pbar.attach(evaluator) model_saver = ModelCheckpoint(f"checkpoints/{dir_name}", f"{cfg.MODEL.NAME}{cfg.TAG}", n_saved=10, create_dir=True) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): trainer.logger.info(trainer.state) trainer.logger.info("Epoch[{}_{}] Loss: {:.2f}".format( trainer.state.epoch, trainer.state.iteration, trainer.state.output)) writer.add_scalar("training/loss", trainer.state.output, trainer.state.iteration) writer.add_scalar("training/lr", optimizer.param_groups[0]['lr'], trainer.state.iteration) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): model_saver(engine, {"model": model}) trainer.logger.info("Model saved!") scheduler.step() @trainer.on(Events.EPOCH_COMPLETED(every=cfg.EVAL_MODEL_EVERY_EPOCH)) def log_validation_results(engine): evaluator.run(train_loader) metrics = evaluator.state.metrics evaluator.logger.info( "Training Results - Epoch: {} TOP_1: {:.2f}".format( trainer.state.epoch, metrics['TOP_1'])) writer.add_scalar("training/TOP_1", metrics['TOP_1'], trainer.state.iteration) evaluator.run(val_loader) metrics = evaluator.state.metrics evaluator.logger.info( "Validation Results - Epoch: {} TOP_1: {:.2f}".format( trainer.state.epoch, metrics['TOP_1'])) writer.add_scalar("validation/TOP_1", metrics['TOP_1'], trainer.state.iteration) trainer.run(train_loader, max_epochs=cfg.EPOCH) elif mode == 'infer': predictor = create_supervised_evaluator(model, {"Predict": Predict(cfg.TEST)}, device) pbar.attach(predictor) predictor.logger = setup_logger("predictor") predictor.run(test_loader) predictor.logger.info("Inference Done.") elif mode == 'eval': evaluator = create_supervised_evaluator(model, {"TOP_1": TOP_1()}, device) pbar.attach(evaluator) evaluator.logger = setup_logger('evaluator') evaluator.run(val_loader) metrics = evaluator.state.metrics evaluator.logger.info("Validation Results - TOP_1: {:.2f}".format( metrics['TOP_1']))
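# `TOP_1` above is a custom metric; in its simplest form it is plain top-1 accuracy.
# A hypothetical sketch built on ignite's Metric base class (the project's version may add
# label mapping or other logic):
from ignite.metrics import Metric


class TOP_1(Metric):
    def reset(self):
        self._num_correct = 0
        self._num_examples = 0

    def update(self, output):
        y_pred, y = output
        # Count samples whose highest-scoring class matches the label.
        self._num_correct += (y_pred.argmax(dim=1) == y).sum().item()
        self._num_examples += y.shape[0]

    def compute(self):
        return self._num_correct / max(self._num_examples, 1)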
def run(
    data_path="/tmp/MNIST",
    seed=3321,
    mode="xentropy",
    noise_fraction=0.35,
    batch_size=64,
    val_batch_size=1000,
    num_epochs=50,
    lr=0.01,
    momentum=0.5,
    as_pseudo_label=None,
    log_dir="/tmp/output-bootstraping-loss/mnist/",
    with_trains=False,
):
    """Training on noisy labels with bootstrapping

    Args:
        data_path (str): Path to MNIST dataset. Default, "/tmp/MNIST"
        seed (int): Random seed to setup. Default, 3321
        mode (str): Loss function mode: cross-entropy or bootstrapping (soft, hard).
            Choices 'xentropy', 'soft_bootstrap', 'hard_bootstrap'.
        noise_fraction (float): Label noise fraction. Default, 0.35.
        batch_size (int): Input batch size for training. Default, 64.
        val_batch_size (int): Input batch size for validation. Default, 1000.
        num_epochs (int): Number of epochs to train. Default, 50.
        lr (float): Learning rate. Default, 0.01.
        momentum (float): SGD momentum. Default, 0.5.
        as_pseudo_label (bool, optional): Used by the 'soft_bootstrap' mode; if None, defaults to True.
        log_dir (str): Log directory for Tensorboard log output. Default="/tmp/output-bootstraping-loss/mnist/".
        with_trains (bool): If True, experiment Trains logger is setup. Default, False.
    """
    assert torch.cuda.is_available(), "Training should run on GPU"
    device = "cuda"

    manual_seed(seed)
    logger = setup_logger(name="MNIST-Training")

    now = datetime.now().strftime("%Y%m%d-%H%M%S")

    # Setup output path
    suffix = ""
    if mode == "soft_bootstrap" and (as_pseudo_label is not None and not as_pseudo_label):
        suffix = "as_xreg"
    output_path = Path(log_dir) / "train_{}_{}_{}_{}__{}".format(mode, noise_fraction, suffix, now, num_epochs)

    if not output_path.exists():
        output_path.mkdir(parents=True)

    parameters = {
        "seed": seed,
        "mode": mode,
        "noise_fraction": noise_fraction,
        "batch_size": batch_size,
        "num_epochs": num_epochs,
        "lr": lr,
        "momentum": momentum,
        "as_pseudo_label": as_pseudo_label,
    }
    log_basic_info(logger, parameters)

    if with_trains:
        from trains import Task

        task = Task.init("BootstrappingLoss - Experiments on MNIST", task_name=output_path.name)
        # Log hyper parameters
        task.connect(parameters)

    train_loader, test_loader = get_data_loaders(data_path, noise_fraction, batch_size, val_batch_size)
    model = Net().to(device)
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)

    if mode == 'xentropy':
        criterion = nn.CrossEntropyLoss()
    elif mode == 'soft_bootstrap':
        if as_pseudo_label is None:
            as_pseudo_label = True
        criterion = SoftBootstrappingLoss(beta=0.95, as_pseudo_label=as_pseudo_label)
    elif mode == 'hard_bootstrap':
        criterion = HardBootstrappingLoss(beta=0.8)
    else:
        raise ValueError("Wrong mode {}, expected: xentropy, soft_bootstrap or hard_bootstrap".format(mode))

    trainer = create_supervised_trainer(model, optimizer, criterion, device=device, non_blocking=True)

    metrics = {
        "Accuracy": Accuracy(),
        "{} loss".format(mode): Loss(criterion),
    }
    if mode != "xentropy":
        metrics["xentropy loss"] = Loss(nn.CrossEntropyLoss())

    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, "Test", state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED | Events.COMPLETED, run_validation)

    evaluators = {"training": train_evaluator, "test": evaluator}
    tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)

    trainer.run(train_loader, max_epochs=num_epochs)

    test_acc = evaluator.state.metrics["Accuracy"]
    tb_logger.writer.add_hparams(parameters, {"hparam/test_accuracy": test_acc})

    tb_logger.close()

    return (mode, noise_fraction, as_pseudo_label, test_acc)
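# `SoftBootstrappingLoss` / `HardBootstrappingLoss` implement the bootstrapping losses of
# Reed et al. A condensed sketch of the soft variant, assuming the same `beta` /
# `as_pseudo_label` semantics as the calls above (stop-gradient on the predictions when they
# are used as pseudo-labels):
import torch.nn.functional as F
from torch import nn


class SoftBootstrappingLoss(nn.Module):
    """L = - sum_k (beta * t_k + (1 - beta) * p_k) * log(p_k), with p = softmax(y_pred)."""

    def __init__(self, beta=0.95, as_pseudo_label=True):
        super().__init__()
        self.beta = beta
        self.as_pseudo_label = as_pseudo_label

    def forward(self, y_pred, y):
        # Standard cross-entropy term weighted by beta.
        beta_xentropy = self.beta * F.cross_entropy(y_pred, y, reduction="none")
        # Bootstrap term: -(1 - beta) * sum_k p_k * log(p_k), optionally detached.
        y_pred_a = y_pred.detach() if self.as_pseudo_label else y_pred
        bootstrap = -(1.0 - self.beta) * (F.softmax(y_pred_a, dim=1) * F.log_softmax(y_pred, dim=1)).sum(dim=1)
        return (beta_xentropy + bootstrap).mean()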
def prepare_batch(batch, device=None, non_blocking: bool = False):
    # Assumed batch layout: ((input_sequence, input_length), input_pos, input_chunk), target_sequence
    # (equivalently: (x_tokens, x_length), x_pos, x_chunk = inputs).
    inputs, target_sequence = batch
    input_seq, input_pos, input_chunk = inputs
    input_sequence, input_length = input_seq
    # create_supervised_trainer expects prepare_batch to return an (x, y) pair.
    return inputs, target_sequence


model = BiLSTMCRF(config).to(config.device)
optimizer = torch.optim.Adam(params=model.parameters(),
                             lr=config.learn.learning_rate,
                             weight_decay=config.learn.weight_decay)
criterion = SequenceCRFLoss(config).to(config.device)

trainer = create_supervised_trainer(model, optimizer, criterion,
                                    prepare_batch=prepare_batch,
                                    device=config.device)

# Create an object of the profiler and attach an engine to it
profiler = BasicTimeProfiler()
profiler.attach(trainer)

trainer.logger = setup_logger("trainer")
trainer.run(train_loader, max_epochs=100)
def training(rank, config): rank = idist.get_rank() manual_seed(config["seed"] + rank) device = idist.device() # Define output folder: config.output = "/tmp/output" model = idist.auto_model(config.model) optimizer = idist.auto_optim(config.optimizer) criterion = config.criterion train_set, val_set = config.train_set, config.val_set train_loader = idist.auto_dataloader(train_set, batch_size=config.train_batch_size) val_loader = idist.auto_dataloader(val_set, batch_size=config.val_batch_size) trainer = create_supervised_trainer(model, optimizer, criterion, device=device) trainer.logger = setup_logger("Trainer") metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)} train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device) train_evaluator.logger = setup_logger("Train Evaluator") validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device) validation_evaluator.logger = setup_logger("Val Evaluator") @trainer.on(Events.EPOCH_COMPLETED(every=config.val_interval)) def compute_metrics(engine): train_evaluator.run(train_loader) validation_evaluator.run(val_loader) if rank == 0: tb_logger = TensorboardLogger(log_dir=config.output) tb_logger.attach_output_handler( trainer, event_name=Events.ITERATION_COMPLETED(every=100), tag="training", output_transform=lambda loss: {"batchloss": loss}, metric_names="all", ) for tag, evaluator in [("training", train_evaluator), ("validation", validation_evaluator)]: tb_logger.attach_output_handler( evaluator, event_name=Events.EPOCH_COMPLETED, tag=tag, metric_names=["loss", "accuracy"], global_step_transform=global_step_from_engine(trainer), ) tb_logger.attach_opt_params_handler( trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer) model_checkpoint = ModelCheckpoint( config.output, n_saved=2, filename_prefix="best", score_name="accuracy", global_step_transform=global_step_from_engine(trainer), ) validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint, {"model": model}) trainer.run(train_loader, max_epochs=config.num_epochs) if rank == 0: tb_logger.close()
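# The backend-agnostic training() above is meant to be launched through idist.Parallel, which
# spawns one process per device and calls training(local_rank, config) in each of them.
# The backend and nproc_per_node values below are illustrative:
import ignite.distributed as idist


def launch(config):
    with idist.Parallel(backend="nccl", nproc_per_node=2) as parallel:
        parallel.run(training, config)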
def run(config): train_loader = get_instance(utils, 'dataloader', config, 'train') val_loader = get_instance(utils, 'dataloader', config, 'val') model = get_instance(models, 'arch', config) model = init_model(model, train_loader) model, device = ModelPrepper(model, config).out loss_fn = get_instance(nn, 'loss_fn', config) trainable_params = filter(lambda p: p.requires_grad, model.parameters()) optimizer = get_instance(torch.optim, 'optimizer', config, trainable_params) writer = create_summary_writer(config, model, train_loader) batch_size = config['dataloader']['args']['batch_size'] if config['mode'] == 'eval' or config['resume']: model.load_state_dict(torch.load(config['ckpt_path'])) epoch_length = int(ceil(len(train_loader) / batch_size)) desc = "ITERATION - loss: {:.2f}" pbar = tqdm(initial=0, leave=False, total=epoch_length, desc=desc.format(0)) def process_batch(engine, batch): inputs, outputs = func(batch) model.train() model.zero_grad() optimizer.zero_grad() preds = model(inputs) loss = loss_fn(preds, outputs.to(device)) a = list(model.parameters())[0].clone() loss.backward() optimizer.step() # check if training is happening b = list(model.parameters())[0].clone() try: assert not torch.allclose(a.data, b.data), 'Model not updating anymore' except AssertionError: plot_grad_flow(model.named_parameters()) return loss.item() def predict_on_batch(engine, batch): inputs, outputs = func(batch) model.eval() with torch.no_grad(): y_pred = model(inputs) return inputs, y_pred, outputs.to(device) trainer = Engine(process_batch) trainer.logger = setup_logger("trainer") evaluator = Engine(predict_on_batch) evaluator.logger = setup_logger("evaluator") if config['task'] == 'actionpred': Accuracy(output_transform=lambda x: (x[1], x[2])).attach( evaluator, 'val_acc') if config['task'] == 'gazepred': MeanSquaredError(output_transform=lambda x: (x[1], x[2])).attach( evaluator, 'val_MSE') RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') training_saver = ModelCheckpoint(config['checkpoint_dir'], filename_prefix='checkpoint_' + config['task'], n_saved=1, atomic=True, save_as_state_dict=True, create_dir=True, require_empty=False) trainer.add_event_handler(Events.EPOCH_COMPLETED, training_saver, {'model': model}) @trainer.on(Events.ITERATION_COMPLETED) def tb_log(engine): pbar.desc = desc.format(engine.state.output) pbar.update(1) writer.add_scalar('training/avg_loss', engine.state.metrics['loss'], engine.state.iteration) @trainer.on(Events.EPOCH_COMPLETED) def print_trainer_logs(engine): pbar.refresh() avg_loss = engine.state.metrics['loss'] tqdm.write('Trainer Results - Epoch {} - Avg loss: {:.2f} \n'.format( engine.state.epoch, avg_loss)) viz_param(writer=writer, model=model, global_step=engine.state.epoch) pbar.n = pbar.last_print_n = 0 @evaluator.on(Events.EPOCH_COMPLETED) def print_result(engine): try: print('Evaluator Results - Accuracy {} \n'.format( engine.state.metrics['val_acc'])) except KeyError: print('Evaluator Results - MSE {} \n'.format( engine.state.metrics['val_MSE'])) @evaluator.on(Events.ITERATION_COMPLETED) def viz_outputs(engine): visualize_outputs(writer=writer, state=engine.state, task=config['task']) if config['mode'] == 'train': trainer.run(train_loader, max_epochs=config['epochs'], epoch_length=epoch_length) pbar.close() evaluator.run(val_loader, max_epochs=1, epoch_length=int(ceil(len(val_loader) / batch_size))) writer.flush() writer.close()
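# `plot_grad_flow` above is the usual gradient-flow diagnostic, invoked when the parameters
# stop changing. A compact version of that commonly shared helper (a sketch; the project's own
# implementation may differ):
import matplotlib.pyplot as plt


def plot_grad_flow(named_parameters):
    # Plot the mean absolute gradient per layer to spot vanishing or exploding gradients.
    names, avg_grads = [], []
    for name, param in named_parameters:
        if param.requires_grad and param.grad is not None and "bias" not in name:
            names.append(name)
            avg_grads.append(param.grad.abs().mean().item())
    plt.plot(avg_grads, alpha=0.3, color="b")
    plt.hlines(0, 0, len(avg_grads) + 1, linewidth=1, color="k")
    plt.xticks(range(len(avg_grads)), names, rotation="vertical")
    plt.xlabel("Layers")
    plt.ylabel("Average gradient")
    plt.title("Gradient flow")
    plt.tight_layout()
    plt.show()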
def training(local_rank, config): rank = idist.get_rank() manual_seed(config["seed"] + rank) device = idist.device() logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank) log_basic_info(logger, config) output_path = config["output_path"] if rank == 0: if config["stop_iteration"] is None: now = datetime.now().strftime("%Y%m%d-%H%M%S") else: now = "stop-on-{}".format(config["stop_iteration"]) folder_name = "{}_backend-{}-{}_{}".format(config["model"], idist.backend(), idist.get_world_size(), now) output_path = Path(output_path) / folder_name if not output_path.exists(): output_path.mkdir(parents=True) config["output_path"] = output_path.as_posix() logger.info("Output path: {}".format(config["output_path"])) # Setup dataflow, model, optimizer, criterion train_loader, test_loader = get_dataflow(config) config["num_iters_per_epoch"] = len(train_loader) model, optimizer, criterion, lr_scheduler = initialize(config) # Create trainer for current task trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger) # Let's now setup evaluator engine to perform model's validation and compute metrics metrics = { "accuracy": Accuracy(), "loss": Loss(criterion), } # We define two evaluators as they wont have exactly similar roles: # - `evaluator` will save the best model based on validation score evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) def run_validation(engine): epoch = trainer.state.epoch state = train_evaluator.run(train_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics) state = evaluator.run(test_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics) trainer.add_event_handler(Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation) if rank == 0: # Setup TensorBoard logging on trainer and evaluators. Logged values are: # - Training metrics, e.g. running average loss values # - Learning rate # - Evaluation train/test metrics evaluators = {"training": train_evaluator, "test": evaluator} tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators) if config["with_trains"]: trains_logger = common.setup_trains_logging( trainer, optimizer, evaluators=evaluators, project_name="cifar10-ignite", task_name=Path(output_path).stem, ) # Store 3 best models by validation accuracy: common.save_best_model_by_val_score( output_path, evaluator, model=model, metric_name="accuracy", n_saved=3, trainer=trainer, tag="test" ) # In order to check training resuming we can stop training on a given iteration if config["stop_iteration"] is not None: @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"])) def _(): logger.info("Stop training on {} iteration".format(trainer.state.iteration)) trainer.terminate() try: trainer.run(train_loader, max_epochs=config["num_epochs"]) except Exception as e: import traceback print(traceback.format_exc()) if rank == 0: tb_logger.close() if config["with_trains"]: trains_logger.close()
def _setup_a_logger_and_dump(name, message):
    logger = setup_logger(name)
    logger.info(message)
def run(args, seed): config.make_paths() torch.random.manual_seed(seed) train_loader, val_loader, shape = get_data_loaders( config.Training.batch_size, proportion=config.Training.proportion, test_batch_size=config.Training.batch_size * 2, ) n, d, t = shape model = models.ConvNet(d, seq_len=t) writer = tb.SummaryWriter(log_dir=config.TENSORBOARD) model.to(config.device) # Move model before creating optimizer optimizer = torch.optim.Adam(model.parameters()) criterion = nn.MSELoss() trainer = create_supervised_trainer(model, optimizer, criterion, device=config.device) trainer.logger = setup_logger("trainer") checkpointer = ModelCheckpoint( config.MODEL, model.__class__.__name__, n_saved=2, create_dir=True, save_as_state_dict=True, ) trainer.add_event_handler( Events.EPOCH_COMPLETED(every=config.Training.save_every), checkpointer, {"model": model}, ) val_metrics = { "mse": Loss(criterion), "mae": MeanAbsoluteError(), "rmse": RootMeanSquaredError(), } evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=config.device) evaluator.logger = setup_logger("evaluator") ar_evaluator = create_ar_evaluator(model, metrics=val_metrics, device=config.device) ar_evaluator.logger = setup_logger("ar") @trainer.on(Events.EPOCH_COMPLETED(every=config.Training.save_every)) def log_ar(engine): ar_evaluator.run(val_loader) y_pred, y = ar_evaluator.state.output fig = plot_output(y, y_pred) writer.add_figure("eval/ar", fig, engine.state.epoch) plt.close() # desc = "ITERATION - loss: {:.2f}" # pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0)) @trainer.on(Events.ITERATION_COMPLETED(every=config.Training.log_every)) def log_training_loss(engine): # pbar.desc = desc.format(engine.state.output) # pbar.update(log_interval) if args.verbose: grad_norm = torch.stack( [p.grad.norm() for p in model.parameters()]).sum() writer.add_scalar("train/grad_norm", grad_norm, engine.state.iteration) writer.add_scalar("train/loss", engine.state.output, engine.state.iteration) @trainer.on(Events.EPOCH_COMPLETED(every=config.Training.eval_every)) def log_training_results(engine): # pbar.refresh() evaluator.run(train_loader) metrics = evaluator.state.metrics for k, v in metrics.items(): writer.add_scalar(f"train/{k}", v, engine.state.epoch) # tqdm.write( # f"Training Results - Epoch: {engine.state.epoch} Avg mse: {evaluator.state.metrics['mse']:.2f}" # ) @trainer.on(Events.EPOCH_COMPLETED(every=config.Training.eval_every)) def log_validation_results(engine): evaluator.run(val_loader) metrics = evaluator.state.metrics for k, v in metrics.items(): writer.add_scalar(f"eval/{k}", v, engine.state.epoch) # tqdm.write( # f"Validation Results - Epoch: {engine.state.epoch} Avg mse: {evaluator.state.metrics['mse']:.2f}" # ) # pbar.n = pbar.last_print_n = 0 y_pred, y = evaluator.state.output fig = plot_output(y, y_pred) writer.add_figure("eval/preds", fig, engine.state.epoch) plt.close() # @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED) # def log_time(engine): # #tqdm.write( # # f"{trainer.last_event_name.name} took {trainer.state.times[trainer.last_event_name.name]} seconds" # #) if args.ckpt is not None: ckpt = torch.load(args.ckpt) ModelCheckpoint.load_objects({"model": model}, ckpt) try: trainer.run(train_loader, max_epochs=config.Training.max_epochs) except Exception as e: import traceback print(traceback.format_exc()) # pbar.close() writer.close()
def training(local_rank, config): rank = idist.get_rank() manual_seed(config["seed"] + rank) device = idist.device() logger = setup_logger(name="CIFAR10-QAT-Training", distributed_rank=local_rank) log_basic_info(logger, config) output_path = config["output_path"] if rank == 0: now = datetime.now().strftime("%Y%m%d-%H%M%S") folder_name = "{}_backend-{}-{}_{}".format(config["model"], idist.backend(), idist.get_world_size(), now) output_path = Path(output_path) / folder_name if not output_path.exists(): output_path.mkdir(parents=True) config["output_path"] = output_path.as_posix() logger.info("Output path: {}".format(config["output_path"])) if "cuda" in device.type: config["cuda device name"] = torch.cuda.get_device_name(local_rank) # Setup dataflow, model, optimizer, criterion train_loader, test_loader = get_dataflow(config) config["num_iters_per_epoch"] = len(train_loader) model, optimizer, criterion, lr_scheduler = initialize(config) # Create trainer for current task trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger) # Let's now setup evaluator engine to perform model's validation and compute metrics metrics = { "Accuracy": Accuracy(), "Loss": Loss(criterion), } # We define two evaluators as they wont have exactly similar roles: # - `evaluator` will save the best model based on validation score evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) def run_validation(engine): epoch = trainer.state.epoch state = train_evaluator.run(train_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics) state = evaluator.run(test_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics) trainer.add_event_handler( Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation) if rank == 0: # Setup TensorBoard logging on trainer and evaluators. Logged values are: # - Training metrics, e.g. running average loss values # - Learning rate # - Evaluation train/test metrics evaluators = {"training": train_evaluator, "test": evaluator} tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators) # Store 3 best models by validation accuracy: common.save_best_model_by_val_score( output_path=config["output_path"], evaluator=evaluator, model=model, metric_name="Accuracy", n_saved=1, trainer=trainer, tag="test", ) trainer.run(train_loader, max_epochs=config["num_epochs"]) if rank == 0: tb_logger.close()
def training(local_rank, cfg): logger = setup_logger("FixMatch Training", distributed_rank=idist.get_rank()) if local_rank == 0: logger.info(cfg.pretty()) rank = idist.get_rank() manual_seed(cfg.seed + rank) device = idist.device() model, ema_model, optimizer, sup_criterion, lr_scheduler = utils.initialize(cfg) unsup_criterion = instantiate(cfg.solver.unsupervised_criterion) cta = get_default_cta() ( supervised_train_loader, test_loader, unsup_train_loader, cta_probe_loader, ) = utils.get_dataflow(cfg, cta=cta, with_unsup=True) def train_step(engine, batch): model.train() optimizer.zero_grad() x, y = batch["sup_batch"]["image"], batch["sup_batch"]["target"] if x.device != device: x = x.to(device, non_blocking=True) y = y.to(device, non_blocking=True) weak_x, strong_x = ( batch["unsup_batch"]["image"], batch["unsup_batch"]["strong_aug"], ) if weak_x.device != device: weak_x = weak_x.to(device, non_blocking=True) strong_x = strong_x.to(device, non_blocking=True) # according to TF code: single forward pass on concat data: [x, weak_x, strong_x] le = 2 * engine.state.mu_ratio + 1 # Why interleave: https://github.com/google-research/fixmatch/issues/20#issuecomment-613010277 # We need to interleave due to multiple-GPU batch norm issues. Let's say we have to GPUs, and our batch is # comprised of labeled (L) and unlabeled (U) images. Let's use a batch size of 2 for making easier visually # in my following example. # # - Without interleaving, we have a batch LLUUUUUU...U (there are 14 U). When the batch is split to be passed # to both GPUs, we'll have two batches LLUUUUUU and UUUUUUUU. Note that all labeled examples ended up in batch1 # sent to GPU1. The problem here is that batch norm will be computed per batch and the moments will lack # consistency between batches. # # - With interleaving, by contrast, the two batches will be LUUUUUUU and LUUUUUUU. As you can notice the # batches have the same distribution of labeled and unlabeled samples and will therefore have more consistent # moments. # x_cat = interleave(torch.cat([x, weak_x, strong_x], dim=0), le) y_pred_cat = model(x_cat) y_pred_cat = deinterleave(y_pred_cat, le) idx1 = len(x) idx2 = idx1 + len(weak_x) y_pred = y_pred_cat[:idx1, ...] y_weak_preds = y_pred_cat[idx1:idx2, ...] # logits_weak y_strong_preds = y_pred_cat[idx2:, ...] 
# logits_strong # supervised learning: sup_loss = sup_criterion(y_pred, y) # unsupervised learning: y_weak_probas = torch.softmax(y_weak_preds, dim=1).detach() y_pseudo = y_weak_probas.argmax(dim=1) max_y_weak_probas, _ = y_weak_probas.max(dim=1) unsup_loss_mask = ( max_y_weak_probas >= engine.state.confidence_threshold ).float() unsup_loss = ( unsup_criterion(y_strong_preds, y_pseudo) * unsup_loss_mask ).mean() total_loss = sup_loss + engine.state.lambda_u * unsup_loss total_loss.backward() optimizer.step() return { "total_loss": total_loss.item(), "sup_loss": sup_loss.item(), "unsup_loss": unsup_loss.item(), "mask": unsup_loss_mask.mean().item(), # this should not be averaged for DDP } output_names = ["total_loss", "sup_loss", "unsup_loss", "mask"] trainer = trainers.create_trainer( train_step, output_names=output_names, model=model, ema_model=ema_model, optimizer=optimizer, lr_scheduler=lr_scheduler, supervised_train_loader=supervised_train_loader, test_loader=test_loader, cfg=cfg, logger=logger, cta=cta, unsup_train_loader=unsup_train_loader, cta_probe_loader=cta_probe_loader, ) trainer.state.confidence_threshold = cfg.ssl.confidence_threshold trainer.state.lambda_u = cfg.ssl.lambda_u trainer.state.mu_ratio = cfg.ssl.mu_ratio distributed = idist.get_world_size() > 1 @trainer.on(Events.ITERATION_COMPLETED(every=cfg.ssl.cta_update_every)) def update_cta_rates(): batch = trainer.state.batch x, y = batch["cta_probe_batch"]["image"], batch["cta_probe_batch"]["target"] if x.device != device: x = x.to(device, non_blocking=True) y = y.to(device, non_blocking=True) policies = batch["cta_probe_batch"]["policy"] ema_model.eval() with torch.no_grad(): y_pred = ema_model(x) y_probas = torch.softmax(y_pred, dim=1) # (N, C) if distributed: for y_proba, t, policy in zip(y_probas, y, policies): error = y_proba error[t] -= 1 error = torch.abs(error).sum() cta.update_rates(policy, 1.0 - 0.5 * error.item()) else: error_per_op = [] for y_proba, t, policy in zip(y_probas, y, policies): error = y_proba error[t] -= 1 error = torch.abs(error).sum() for k, bins in policy: error_per_op.append(pack_as_tensor(k, bins, error)) error_per_op = torch.stack(error_per_op) # all gather tensor_list = idist.all_gather(error_per_op) # update cta rates for t in tensor_list: k, bins, error = unpack_from_tensor(t) cta.update_rates([(k, bins),], 1.0 - 0.5 * error) epoch_length = cfg.solver.epoch_length num_epochs = cfg.solver.num_epochs if not cfg.debug else 2 try: trainer.run( supervised_train_loader, epoch_length=epoch_length, max_epochs=num_epochs ) except Exception as e: import traceback print(traceback.format_exc())
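# `interleave` / `deinterleave` above follow the trick from the official FixMatch code
# referenced in the comment. A sketch of the common reshape-based implementation, assuming the
# batch dimension is divisible by `size` (= 2 * mu_ratio + 1 here):
import torch


def interleave(x, size):
    # [B, ...] -> [B // size, size, ...] -> transpose -> flatten, so labeled and unlabeled
    # samples are spread evenly across the sub-batches seen by each GPU.
    s = list(x.shape)
    return x.reshape([-1, size] + s[1:]).transpose(0, 1).reshape([-1] + s[1:])


def deinterleave(x, size):
    # Inverse permutation of `interleave`.
    s = list(x.shape)
    return x.reshape([size, -1] + s[1:]).transpose(0, 1).reshape([-1] + s[1:])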
def __init__(self, output_transform: Callable = lambda x: x):
    self.logger = setup_logger(__name__ + "." + self.__class__.__name__)
    self.logger.addHandler(logging.StreamHandler())
    self._output_transform = output_transform
def auto_model(model: nn.Module) -> nn.Module: """Helper method to adapt provided model for non-distributed and distributed configurations (supporting all available backends from :meth:`~ignite.distributed.utils.available_backends()`). Internally, we perform to following: - send model to current :meth:`~ignite.distributed.utils.device()` if model's parameters are not on the device. - wrap the model to `torch DistributedDataParallel`_ for native torch distributed if world size is larger than 1. - wrap the model to `torch DataParallel`_ if no distributed context found and more than one CUDA devices available. - broadcast the initial variable states from rank 0 to all other processes if Horovod distributed framework is used. Examples: .. code-block:: python import ignite.distribted as idist model = idist.auto_model(model) In addition with NVidia/Apex, it can be used in the following way: .. code-block:: python import ignite.distribted as idist model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) model = idist.auto_model(model) Args: model (torch.nn.Module): model to adapt. Returns: torch.nn.Module .. _torch DistributedDataParallel: https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel .. _torch DataParallel: https://pytorch.org/docs/stable/nn.html#torch.nn.DataParallel """ logger = setup_logger(__name__ + ".auto_model") # Put model's parameters to device if its parameters are not on the device device = idist.device() if not all([p.device == device for p in model.parameters()]): model.to(device) # distributed data parallel model if idist.get_world_size() > 1: bnd = idist.backend() if idist.has_native_dist_support and bnd == idist_native.NCCL: lrank = idist.get_local_rank() logger.info( "Apply torch DistributedDataParallel on model, device id: {}". format(lrank)) model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[ lrank, ]) elif idist.has_native_dist_support and bnd == idist_native.GLOO: logger.info("Apply torch DistributedDataParallel on model") model = torch.nn.parallel.DistributedDataParallel(model) elif idist.has_hvd_support and bnd == idist_hvd.HOROVOD: import horovod.torch as hvd logger.info( "Broadcast the initial variable states from rank 0 to all other processes" ) hvd.broadcast_parameters(model.state_dict(), root_rank=0) # not distributed but multiple GPUs reachable so data parallel model elif torch.cuda.device_count() > 1 and "cuda" in idist.device().type: logger.info("Apply torch DataParallel on model") model = torch.nn.parallel.DataParallel(model) return model
127:128])
        loss.backward()
        optimizer.step()
        return loss.item()

    # Create Trainer or Evaluators
    trainer = Engine(backprop_step)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)

    trainer.logger = setup_logger("Trainer")
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator.logger = setup_logger("Validation Evaluator")

    # Tensorboard Logger setup below based on pytorch ignite example
    # https://github.com/pytorch/ignite/blob/master/examples/contrib/mnist/mnist_with_tensorboard_logger.py
    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        """Callback to compute metrics on the train and validation data."""
        train_evaluator.run(finetuning_loader)
        validation_evaluator.run(test_loader)
        scheduler.step(validation_evaluator.state.metrics['loss'])

    def score_function(engine):
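The ``scheduler.step(...)`` call above suggests a plateau-style scheduler driven by the validation loss. A hedged, self-contained sketch of that wiring; the toy model, random data and hyper-parameters are placeholder assumptions.

# Hedged sketch: driving ReduceLROnPlateau from an ignite evaluator's loss metric.
import torch
import torch.nn as nn
from torch.optim import SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Loss

model = nn.Linear(4, 2)
optimizer = SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
scheduler = ReduceLROnPlateau(optimizer, mode="min", patience=2)

data = [(torch.randn(8, 4), torch.randint(0, 2, (8,))) for _ in range(10)]

trainer = create_supervised_trainer(model, optimizer, criterion)
evaluator = create_supervised_evaluator(model, metrics={"loss": Loss(criterion)})

@trainer.on(Events.EPOCH_COMPLETED)
def validate_and_step(engine):
    evaluator.run(data)
    # the scheduler lowers the learning rate when the monitored loss stops improving
    scheduler.step(evaluator.state.metrics["loss"])

trainer.run(data, max_epochs=5)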
def auto_dataloader(dataset: Dataset, **kwargs: Any) -> Union[DataLoader, "_MpDeviceLoader"]:
    """Helper method to create a dataloader adapted for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, we create a dataloader with provided kwargs while applying the following updates:

    - batch size is scaled by world size: ``batch_size / world_size`` if larger than or equal to the world size.
    - number of workers is scaled by number of local processes: ``num_workers / nprocs`` if larger than or equal
      to the number of processes per node.
    - if no sampler provided by user, a `torch DistributedSampler`_ is set up.
    - if a `torch DistributedSampler`_ is provided by user, it is used without wrapping it.
    - if another sampler is provided, it is wrapped by :class:`~ignite.distributed.auto.DistributedProxySampler`.
    - if the default device is 'cuda', `pin_memory` is automatically set to `True`.

    .. warning::

        Custom batch sampler is not adapted for distributed configuration. Please, make sure that provided batch
        sampler is compatible with distributed configuration.

    Args:
        dataset: input torch dataset. If input dataset is `torch IterableDataset`_ then dataloader will be
            created without any distributed sampling. Please, make sure that the dataset itself produces
            different data on different ranks.
        kwargs: keyword arguments for `torch DataLoader`_.

    Returns:
        `torch DataLoader`_ or `XLA MpDeviceLoader`_ for XLA devices

    Examples:

    .. code-block:: python

        import ignite.distributed as idist

        train_loader = idist.auto_dataloader(
            train_dataset,
            batch_size=32,
            num_workers=4,
            shuffle=True,
            pin_memory="cuda" in idist.device().type,
            drop_last=True,
        )

    .. _torch DataLoader: https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader
    .. _XLA MpDeviceLoader: https://github.com/pytorch/xla/blob/master/torch_xla/distributed/parallel_loader.py#L178
    .. _torch DistributedSampler: https://pytorch.org/docs/stable/data.html#torch.utils.data.distributed.DistributedSampler
    .. _torch IterableDataset: https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset
    """
    rank = idist.get_rank()
    world_size = idist.get_world_size()

    logger = setup_logger(__name__ + ".auto_dataloader")
    if world_size > 1:
        if "batch_size" in kwargs and kwargs["batch_size"] >= world_size:
            kwargs["batch_size"] //= world_size

        nproc = idist.get_nproc_per_node()
        if "num_workers" in kwargs and kwargs["num_workers"] >= nproc:
            kwargs["num_workers"] = (kwargs["num_workers"] + nproc - 1) // nproc

        if "batch_sampler" not in kwargs:
            if isinstance(dataset, IterableDataset):
                logger.info(
                    "Found iterable dataset, dataloader will be created without any distributed sampling. "
                    "Please, make sure that the dataset itself produces different data on different ranks."
                )
            else:
                sampler: Optional[Union[DistributedProxySampler, DistributedSampler, Sampler]]
                sampler = kwargs.get("sampler", None)
                if isinstance(sampler, DistributedSampler):
                    if sampler.rank != rank:
                        warnings.warn(f"Found distributed sampler with rank={sampler.rank}, but process rank is {rank}")
                    if sampler.num_replicas != world_size:
                        warnings.warn(
                            f"Found distributed sampler with num_replicas={sampler.num_replicas}, "
                            f"but world size is {world_size}"
                        )
                elif sampler is None:
                    # removes "shuffle" from kwargs if sampler is used
                    shuffle = kwargs.pop("shuffle", True)
                    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=shuffle)
                else:
                    sampler = DistributedProxySampler(sampler, num_replicas=world_size, rank=rank)
                kwargs["sampler"] = sampler
        else:
            warnings.warn(
                "Found batch_sampler in provided kwargs. Please, make sure that it is compatible "
                "with distributed configuration"
            )

    if idist.has_xla_support and idist.backend() == idist_xla.XLA_TPU and kwargs.get("pin_memory", False):
        # TODO: How about XLA GPU ?
        warnings.warn(
            "Found incompatible options: xla support and pin_memory args equal True. "
            "Argument `pin_memory=False` will be used to construct data loader."
        )
        kwargs["pin_memory"] = False
    else:
        kwargs["pin_memory"] = kwargs.get("pin_memory", "cuda" in idist.device().type)

    logger.info(f"Use data loader kwargs for dataset '{repr(dataset)[:20].strip()}': \n\t{kwargs}")
    dataloader = DataLoader(dataset, **kwargs)

    if idist.has_xla_support and idist.backend() == idist_xla.XLA_TPU and world_size > 1:
        logger.info("DataLoader is wrapped by `MpDeviceLoader` on XLA")

        mp_device_loader_cls = _MpDeviceLoader
        try:
            from torch_xla.distributed.parallel_loader import MpDeviceLoader

            mp_device_loader_cls = MpDeviceLoader
        except ImportError:
            pass

        mp_dataloader = mp_device_loader_cls(dataloader, idist.device())
        mp_dataloader.sampler = dataloader.sampler  # type: ignore[attr-defined]
        return mp_dataloader

    return dataloader
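The scaling rule applied at the top of ``auto_dataloader`` can be illustrated in isolation; the concrete values below (global batch size 256, 8 workers, 4 processes on one node) are assumptions chosen only to show the arithmetic.

# Hedged sketch of the per-process scaling performed above.
def scale_kwargs(batch_size, num_workers, world_size, nproc_per_node):
    if world_size > 1:
        if batch_size >= world_size:
            batch_size //= world_size
        if num_workers >= nproc_per_node:
            num_workers = (num_workers + nproc_per_node - 1) // nproc_per_node
    return batch_size, num_workers

# e.g. batch_size=256, num_workers=8 split across 4 processes on a single node
print(scale_kwargs(256, 8, world_size=4, nproc_per_node=4))  # -> (64, 2)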
def test_dist_setup_logger():
    logger = setup_logger("trainer", level=logging.CRITICAL, distributed_rank=1)
    assert logger.level != logging.CRITICAL
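This test relies on ``setup_logger`` only applying the requested configuration on rank 0. A hedged illustration of that expectation (the logger names are arbitrary; the expected outputs are assumptions consistent with the assertion above, not verified behaviour):

# Hedged illustration of the rank-dependent behaviour the test asserts.
import logging
from ignite.utils import setup_logger

rank0_logger = setup_logger("trainer-rank0", level=logging.CRITICAL, distributed_rank=0)
rank1_logger = setup_logger("trainer-rank1", level=logging.CRITICAL, distributed_rank=1)

print(rank0_logger.level == logging.CRITICAL)  # expected True: rank 0 is fully configured
print(rank1_logger.level == logging.CRITICAL)  # expected False, as asserted in the test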
def run(train_batch_size, val_batch_size, epochs, lr, momentum):
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    npt_logger = NeptuneLogger(
        api_token="ANONYMOUS",
        project_name="shared/pytorch-ignite-integration",
        name="ignite-mnist-example",
        params={
            "train_batch_size": train_batch_size,
            "val_batch_size": val_batch_size,
            "epochs": epochs,
            "lr": lr,
            "momentum": momentum,
        },
    )

    npt_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )

    for tag, evaluator in [("training", train_evaluator), ("validation", validation_evaluator)]:
        npt_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    npt_logger.attach_opt_params_handler(
        trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer
    )

    npt_logger.attach(
        trainer, log_handler=WeightsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100)
    )

    npt_logger.attach(
        trainer, log_handler=GradsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100)
    )

    def score_function(engine):
        return engine.state.metrics["accuracy"]

    handler = Checkpoint(
        {"model": model},
        NeptuneSaver(npt_logger),
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, handler)

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    npt_logger.close()
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_dir):
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    vd_logger = VisdomLogger(env="mnist_training")

    vd_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )

    for tag, evaluator in [("training", train_evaluator), ("validation", validation_evaluator)]:
        vd_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    vd_logger.attach_opt_params_handler(
        trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer
    )

    vd_logger.attach(
        trainer, log_handler=WeightsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100)
    )

    vd_logger.attach(
        trainer, log_handler=GradsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100)
    )

    def score_function(engine):
        return engine.state.metrics["accuracy"]

    model_checkpoint = ModelCheckpoint(
        log_dir,
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint, {"model": model})

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    vd_logger.close()
def run(epochs, lr, momentum, log_interval):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    net = Net().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum)

    trainer = create_supervised_trainer(net, optimizer, criterion, device=device)
    trainer.logger = setup_logger("trainer")

    val_metrics = {"accuracy": Accuracy(), "loss": Loss(criterion), "recall": Recall()}
    evaluator = create_supervised_evaluator(net, metrics=val_metrics, device=device)
    evaluator.logger = setup_logger("evaluator")

    # Attach handler to plot trainer's loss every 100 iterations
    tb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=params.get('loss_report')),
        tag="training",
        output_transform=lambda loss: {"loss": loss},
    )

    # Attach handler to dump evaluator's metrics every epoch completed
    for tag, evaluator in [("training", trainer), ("validation", evaluator)]:
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names="all",
            global_step_transform=global_step_from_engine(trainer),
        )

    # Attach function to build debug images and report every epoch end
    tb_logger.attach(
        evaluator,
        log_handler=predictions_gt_images_handler,
        event_name=Events.EPOCH_COMPLETED(once=1),
    )

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0, leave=False, total=len(trainloader), desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        pbar.desc = desc.format(engine.state.output)
        pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(trainloader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["loss"]
        tqdm.write(
            "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
                engine.state.epoch, avg_accuracy, avg_nll
            )
        )

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(testloader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["loss"]
        tqdm.write(
            "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
                engine.state.epoch, avg_accuracy, avg_nll
            )
        )
        pbar.n = pbar.last_print_n = 0

    @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
    def log_time():
        tqdm.write(
            "{} took {} seconds".format(
                trainer.last_event_name.name, trainer.state.times[trainer.last_event_name.name]
            )
        )

    trainer.run(trainloader, max_epochs=epochs)
    pbar.close()

    PATH = './cifar_net.pth'
    torch.save(net.state_dict(), PATH)

    print('Finished Training')
    print('Task ID number is: {}'.format(task.id))
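The snippet above attaches handlers to a ``tb_logger`` that is created elsewhere. A hedged sketch of how such a logger is typically constructed and closed around the training run; the log directory is an illustrative assumption.

# Hedged sketch: constructing the TensorboardLogger the handlers above attach to.
from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger

tb_logger = TensorboardLogger(log_dir="./tb-logs")  # assumed directory
# ... attach output / optimizer / image handlers as in the snippet above ...
# close the logger once training has finished so pending events are flushed
tb_logger.close()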
def auto_model(model: nn.Module, sync_bn: bool = False, **kwargs: Any) -> nn.Module:
    """Helper method to adapt provided model for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, we perform the following:

    - send model to current :meth:`~ignite.distributed.utils.device()` if model's parameters are not on the device.
    - wrap the model to `torch DistributedDataParallel`_ for native torch distributed if world size is larger than 1.
    - wrap the model to `torch DataParallel`_ if no distributed context found and more than one CUDA device is
      available.
    - broadcast the initial variable states from rank 0 to all other processes if Horovod distributed framework is used.

    Examples:

    .. code-block:: python

        import ignite.distributed as idist

        model = idist.auto_model(model)

    In addition, with NVidia/Apex, it can be used in the following way:

    .. code-block:: python

        import ignite.distributed as idist

        model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
        model = idist.auto_model(model)

    Args:
        model: model to adapt.
        sync_bn: if True, applies `torch convert_sync_batchnorm`_ to the model for native torch
            distributed only. Default, False. Note, if using Nvidia/Apex, batchnorm conversion should be
            applied before calling ``amp.initialize``.
        kwargs: kwargs to model's wrapping class: `torch DistributedDataParallel`_ or `torch DataParallel`_
            if applicable. Please, make sure to use acceptable kwargs for given backend.

    Returns:
        torch.nn.Module

    .. _torch DistributedDataParallel: https://pytorch.org/docs/stable/generated/torch.nn.parallel.
        DistributedDataParallel.html
    .. _torch DataParallel: https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html
    .. _torch convert_sync_batchnorm: https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html#
        torch.nn.SyncBatchNorm.convert_sync_batchnorm

    .. versionchanged:: 0.4.2

        - Added Horovod distributed framework.
        - Added ``sync_bn`` argument.

    .. versionchanged:: 0.4.3
        Added kwargs to ``idist.auto_model``.
    """
    logger = setup_logger(__name__ + ".auto_model")

    # Put model's parameters to device if its parameters are not on the device
    device = idist.device()
    if not all([p.device == device for p in model.parameters()]):
        model.to(device)

    # distributed data parallel model
    if idist.get_world_size() > 1:
        bnd = idist.backend()
        if idist.has_native_dist_support and bnd in (idist_native.NCCL, idist_native.GLOO, idist_native.MPI):
            if sync_bn:
                logger.info("Convert batch norm to sync batch norm")
                model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

            if torch.cuda.is_available():
                if "device_ids" in kwargs:
                    raise ValueError(f"Argument kwargs should not contain 'device_ids', but got {kwargs}")

                lrank = idist.get_local_rank()
                logger.info(f"Apply torch DistributedDataParallel on model, device id: {lrank}")
                kwargs["device_ids"] = [lrank, ]
            else:
                logger.info("Apply torch DistributedDataParallel on model")

            model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
        elif idist.has_hvd_support and bnd == idist_hvd.HOROVOD:
            import horovod.torch as hvd

            logger.info("Broadcast the initial variable states from rank 0 to all other processes")
            hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    # not distributed but multiple GPUs reachable so data parallel model
    elif torch.cuda.device_count() > 1 and "cuda" in idist.device().type:
        logger.info("Apply torch DataParallel on model")
        model = torch.nn.parallel.DataParallel(model, **kwargs)

    return model
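A hedged usage sketch of the ``sync_bn`` flag and forwarded ``**kwargs`` in this version; the "nccl" backend, the toy model and the ``find_unused_parameters`` flag are assumptions for illustration and require a multi-GPU setup.

# Hedged usage sketch: sync batch norm conversion plus extra DDP kwargs.
import torch.nn as nn
import ignite.distributed as idist

def training(local_rank, config):
    model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
    # BatchNorm layers are converted to SyncBatchNorm before DDP wrapping;
    # remaining kwargs are forwarded to DistributedDataParallel.
    model = idist.auto_model(model, sync_bn=True, find_unused_parameters=True)
    print(idist.get_rank(), type(model).__name__)

if __name__ == "__main__":
    with idist.Parallel(backend="nccl", nproc_per_node=2) as parallel:
        parallel.run(training, config={})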
    def setup(self):
        self._init_distribution()

        self.trainer = Engine(self.train_step)
        self.trainer.logger = setup_logger(name="trainer", distributed_rank=self.local_rank)
        self.log_basic_info(self.trainer.logger)
        self.load_trainer_from_checkpoint()

        if self.scheduler:
            self.scheduler_event = self.trainer.add_event_handler(Events.ITERATION_STARTED, self.scheduler)
        else:
            self.scheduler_event = None

        self.attach_metrics(self.trainer, self.train_metrics)

        if idist.get_world_size() > 1:
            def set_epoch(engine):
                self.train_loader.sampler.set_epoch(engine.state.epoch)

            self.trainer.add_event_handler(Events.EPOCH_STARTED, set_epoch)

        common.setup_common_training_handlers(
            self.trainer,
            train_sampler=self.train_loader.sampler,
            to_save=None,
            save_every_iters=0,
            output_path=None,
            lr_scheduler=None,
            output_names=None,
            with_pbars=self.hparams.add_pbar,
            clear_cuda_cache=True,
            stop_on_nan=False,
        )

        self.evaluator = Engine(self.eval_step)
        self.evaluator.logger = setup_logger("evaluator", distributed_rank=self.local_rank)
        if self.hparams.add_pbar:
            ProgressBar(persist=False).attach(self.evaluator)

        def complete_clear(engine):
            engine.state.batch = None
            engine.state.output = None
            import gc

            gc.collect()

        self.trainer.add_event_handler(Events.EPOCH_COMPLETED, complete_clear)
        self.validation_handler_event = self.trainer.add_event_handler(
            Events.EPOCH_COMPLETED(every=self.hparams.eval_every), self.validate(self.valid_loader)
        )
        self.evaluator.add_event_handler(Events.EPOCH_COMPLETED, complete_clear)

        train_handler_params = {"model": self.model, "optimizer": self.optimizer, "scheduler": self.scheduler}
        eval_handler_params = {"model": self.model, "optimizer": self.optimizer, "scheduler": self.scheduler}

        to_save = {"model": self.model, "trainer": self.trainer, "optimizer": self.optimizer}
        if self.scheduler is not None:
            to_save["scheduler"] = self.scheduler
        if USE_AMP:
            to_save["amp"] = amp

        self.attach_metrics(self.evaluator, self.validation_metrics)
        self.setup_checkpoint_saver(to_save)

        if self.rank == 0:
            self._init_logger()

        if self.logger:
            self.logger._init_logger(self.trainer, self.evaluator)
            self.logger._add_train_events(**train_handler_params)
            self.logger._add_eval_events(**eval_handler_params)
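``setup_checkpoint_saver`` is defined elsewhere in this class; a hedged sketch of the common way a ``to_save`` dict like the one built above is checkpointed with ignite. The output directory, ``n_saved`` and the every-epoch cadence are assumptions.

# Hedged sketch: checkpointing a to_save dict of model, optimizer and trainer state.
import torch.nn as nn
from torch.optim import SGD
from ignite.engine import Engine, Events
from ignite.handlers import Checkpoint, DiskSaver

model = nn.Linear(4, 2)
optimizer = SGD(model.parameters(), lr=0.1)
trainer = Engine(lambda engine, batch: None)

to_save = {"model": model, "optimizer": optimizer, "trainer": trainer}
checkpoint_handler = Checkpoint(
    to_save,
    DiskSaver("./checkpoints", create_dir=True, require_empty=False),  # assumed path
    n_saved=2,
)
# save a checkpoint at the end of every epoch
trainer.add_event_handler(Events.EPOCH_COMPLETED(every=1), checkpoint_handler)

trainer.run(data=[0, 1, 2], max_epochs=3)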