def training(local_rank, config):
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)
    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)

            # Log hyperparameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger)

    # Let's now set up the evaluator engines to run the model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation
    )

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="test",
    )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()

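
# How a `training(local_rank, config)` function like the one above gets
# launched is not shown in this snippet. Below is a minimal sketch using
# PyTorch-Ignite's `idist.Parallel` context manager; the backend and process
# count are illustrative assumptions, not values taken from the original
# script.

import ignite.distributed as idist

def launch(config):
    # Spawns one process per GPU and calls training(local_rank, config) in
    # each of them; with backend=None the function runs serially in the
    # current process, which is handy for debugging.
    with idist.Parallel(backend="nccl", nproc_per_node=2) as parallel:
        parallel.run(training, config)
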
def training(local_rank, config):
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="ImageNet-Training", distributed_rank=local_rank)
    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = "stop-on-{}".format(config["stop_iteration"])

        folder_name = "{}_backend-{}-{}_{}".format(config["model"], idist.backend(), idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_imagenet_dataloader(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task (the custom `create_trainer` helper,
    # not ignite's `create_supervised_trainer`, which has a different signature)
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger)

    # Let's now set up the evaluator engines to run the model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation
    )

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="test",
    )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info("Stop training on {} iteration".format(trainer.state.iteration))
            trainer.terminate()

    @trainer.on(Events.ITERATION_COMPLETED(every=20))
    def print_batch_loss(engine):
        if rank == 0:
            # engine.state.iteration is the global iteration count; convert it
            # to the iteration within the current epoch for readability
            epoch_iteration = (engine.state.iteration - 1) % len(train_loader) + 1
            print(
                "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}".format(
                    engine.state.epoch, epoch_iteration, len(train_loader), engine.state.saved_batch_loss
                )
            )

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()

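
# The handler above reads `engine.state.saved_batch_loss`, which must be set
# by the custom `create_trainer` helper (not defined in this snippet). A
# hypothetical minimal sketch of such a trainer follows; the plain fp32 step,
# the per-iteration `lr_scheduler.step()`, and the sampler handler are
# assumptions — the real helper likely also wires up checkpointing, AMP, etc.

import ignite.distributed as idist
from ignite.engine import Engine, Events
from torch.utils.data.distributed import DistributedSampler

def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, config, logger):
    device = idist.device()

    def train_step(engine, batch):
        model.train()
        x = batch[0].to(device, non_blocking=True)
        y = batch[1].to(device, non_blocking=True)
        y_pred = model(x)
        loss = criterion(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # Expose the batch loss on the engine state so handlers such as
        # `print_batch_loss` above can read `engine.state.saved_batch_loss`
        engine.state.saved_batch_loss = loss.item()
        return engine.state.saved_batch_loss

    trainer = Engine(train_step)

    # Keep shards in sync across processes when a DistributedSampler is used
    if isinstance(train_sampler, DistributedSampler):

        @trainer.on(Events.EPOCH_STARTED)
        def set_epoch(engine):
            train_sampler.set_epoch(engine.state.epoch - 1)

    return trainer
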
def training(local_rank, config, logger, with_clearml):
    rank = idist.get_rank()
    manual_seed(config.seed + local_rank)

    train_loader = config.train_loader
    val_loader = config.val_loader
    train_eval_loader = config.train_eval_loader

    model, optimizer, criterion = utils.initialize(config)

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger, with_clearml)

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if ("val_metrics" in config) and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="val")
    train_evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="train")

    val_interval = config.get("val_interval", 1)

    # Run validation every val_interval epochs, at the end of training,
    # and at the beginning if config.start_by_validation is True
    event = Events.EPOCH_COMPLETED(every=val_interval)
    if config.num_epochs % val_interval != 0:
        event |= Events.COMPLETED
    if config.get("start_by_validation", False):
        event |= Events.STARTED

    @trainer.on(event)
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        utils.log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(val_loader)
        utils.log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    score_metric_name = "mIoU_bg"
    if "es_patience" in config:
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    # Store the 2 best models by validation mIoU_bg score:
    common.gen_save_best_models_by_val_score(
        save_handler=utils.get_save_handler(config.output_path.as_posix(), with_clearml),
        evaluator=evaluator,
        models=model,
        metric_name=score_metric_name,
        n_saved=2,
        trainer=trainer,
        tag="val",
    )

    # Setup TensorBoard logger
    if rank == 0:
        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={"training": train_evaluator, "validation": evaluator},
        )

        # Log validation predictions as images
        # We define a custom event filter to log the images less frequently (to reduce storage size):
        #  - we plot images with masks of the middle validation batch
        #  - once every 3 validations and
        #  - at the end of the training
        def custom_event_filter(_, val_iteration):
            c1 = val_iteration == len(val_loader) // 2
            c2 = trainer.state.epoch % (config.get("val_interval", 1) * 3) == 0
            c2 |= trainer.state.epoch == config.num_epochs
            return c1 and c2

        # Image denormalization function to plot predictions with images
        mean = config.get("mean", (0.485, 0.456, 0.406))
        std = config.get("std", (0.229, 0.224, 0.225))
        img_denormalize = partial(data.denormalize, mean=mean, std=std)

        tb_logger.attach(
            evaluator,
            log_handler=vis.predictions_gt_images_handler(
                img_denormalize_fn=img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation"
            ),
            event_name=Events.ITERATION_COMPLETED(event_filter=custom_event_filter),
        )

    # Log confusion matrix to ClearML:
    if with_clearml:
        trainer.add_event_handler(Events.COMPLETED, compute_and_log_cm, cm_metric, trainer.state.iteration)

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if idist.get_rank() == 0:
        tb_logger.close()

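
# `compute_and_log_cm` is referenced above but not defined in this snippet.
# A sketch consistent with the inline ClearML handler in the next variant;
# the signature is inferred from the call site, and
# `VOCSegmentationOpencv.target_names` is assumed to be importable from the
# project's dataset module and to hold the class labels.

import ignite.distributed as idist
from ignite.metrics import ConfusionMatrix

def compute_and_log_cm(cm_metric, iteration):
    cm = cm_metric.compute()
    # Normalize so that diagonal values represent per-class recalls
    cm = ConfusionMatrix.normalize(cm, "recall").cpu().numpy()
    if idist.get_rank() == 0:
        from clearml import Task

        clearml_logger = Task.current_task().get_logger()
        clearml_logger.report_confusion_matrix(
            title="Final Confusion Matrix",
            series="cm-preds-gt",
            matrix=cm,
            iteration=iteration,
            xlabels=VOCSegmentationOpencv.target_names,
            ylabels=VOCSegmentationOpencv.target_names,
        )
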
def training(local_rank, config, logger=None):
    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses fp16 AMP by default")

    torch.backends.cudnn.benchmark = True

    set_seed(config.seed + local_rank)

    train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader

    # Setup model, optimizer, criterion
    model, optimizer, criterion = initialize(config)

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger)

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator, train_evaluator = create_evaluators(model, val_metrics, config)

    val_interval = getattr(config, "val_interval", 1)

    @trainer.on(Events.EPOCH_COMPLETED(every=val_interval))
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(val_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    if config.num_epochs % val_interval != 0:
        trainer.add_event_handler(Events.COMPLETED, run_validation)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    # Store the 3 best models by validation mIoU_bg score:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models=model,
        metric_name=score_metric_name,
        n_saved=3,
        trainer=trainer,
        tag="val",
    )

    if idist.get_rank() == 0:
        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={"training": train_evaluator, "validation": evaluator},
        )

        if not exp_tracking.has_clearml:
            exp_tracking_logger = exp_tracking.setup_logging(
                trainer, optimizer, evaluators={"training": train_evaluator, "validation": evaluator}
            )

        # Log validation predictions as images
        # We define a custom event filter to log the images less frequently (to reduce storage size):
        #  - we plot images with masks of the middle validation batch
        #  - once every 3 validations and
        #  - at the end of the training
        def custom_event_filter(_, val_iteration):
            c1 = val_iteration == len(val_loader) // 2
            c2 = trainer.state.epoch % (getattr(config, "val_interval", 1) * 3) == 0
            c2 |= trainer.state.epoch == config.num_epochs
            return c1 and c2

        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation"
            ),
            event_name=Events.ITERATION_COMPLETED(event_filter=custom_event_filter),
        )

    # Log confusion matrix to ClearML:
    if exp_tracking.has_clearml:

        @trainer.on(Events.COMPLETED)
        def compute_and_log_cm():
            cm = cm_metric.compute()
            # CM: values are normalized such that diagonal values represent class recalls
            cm = ConfusionMatrix.normalize(cm, "recall").cpu().numpy()

            if idist.get_rank() == 0:
                try:
                    from clearml import Task
                except ImportError:
                    # Backwards-compatibility for legacy Trains SDK
                    from trains import Task

                clearml_logger = Task.current_task().get_logger()
                clearml_logger.report_confusion_matrix(
                    title="Final Confusion Matrix",
                    series="cm-preds-gt",
                    matrix=cm,
                    iteration=trainer.state.iteration,
                    xlabels=VOCSegmentationOpencv.target_names,
                    ylabels=VOCSegmentationOpencv.target_names,
                )

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if idist.get_rank() == 0:
        tb_logger.close()
        if not exp_tracking.has_clearml:
            exp_tracking_logger.close()

def run_training(local_rank: int, config: ConfigSchema) -> Dict[str, float]:
    rank = idist.get_rank()
    if config.seed is not None:
        manual_seed(config.seed + rank)

    logger = setup_logger(name=config.experiment_name, distributed_rank=local_rank)

    log_basic_info(logger, config)

    if rank == 0:
        prepare_output_directory(config)
        logger.info("Output path: {}".format(config.output_path))

    weak_label_mgr = get_weak_label_manager(config)

    # Setup dataflow, model, optimizer, criterion
    data_loaders = get_dataflow(config, weak_label_mgr)
    train_loader = data_loaders["train"]
    config.num_iters_per_epoch = len(train_loader)

    model, optimizer, criterion = initialize(config, weak_label_mgr)
    metrics = get_metrics(criterion)

    trainer, evaluators = create_trainer_and_evaluators(
        model, optimizer, criterion, data_loaders, metrics, config, logger
    )

    if rank == 0:
        tb_logger = common.setup_tb_logging(config.output_path, trainer, optimizer, evaluators=evaluators)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluators["val"],
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="test",
    )

    state_at_best_val = StateAtBestVal(
        score_function=lambda: evaluators["val"].state.metrics["accuracy"],
        state_function=lambda: dict(
            {"val_" + key: val for key, val in evaluators["val"].state.metrics.items()},
            **{"test_" + key: val for key, val in evaluators["test"].state.metrics.items()},
            epoch=trainer.state.epoch,
        ),
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED, state_at_best_val)

    try:
        trainer.run(train_loader, max_epochs=config.num_epochs)
    except Exception:
        import traceback

        print(traceback.format_exc())
    else:
        assert state_at_best_val.best_state is not None
        tb_logger.writer.add_hparams(  # type: ignore
            get_hparams(config),
            {"hparam/" + key: val for key, val in state_at_best_val.best_state.items()},
        )
    finally:
        if rank == 0:
            tb_logger.close()  # type: ignore

    return evaluators["val"].state.metrics

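
# `StateAtBestVal` is used above but not defined in this snippet. A minimal
# sketch, assuming it snapshots the user-provided state whenever the
# validation score improves (ignite accepts zero-argument handlers, so the
# engine parameter can be omitted):

from typing import Any, Callable, Dict, Optional

class StateAtBestVal:
    def __init__(
        self, score_function: Callable[[], float], state_function: Callable[[], Dict[str, Any]]
    ) -> None:
        self.score_function = score_function
        self.state_function = state_function
        self.best_score: Optional[float] = None
        self.best_state: Optional[Dict[str, Any]] = None

    def __call__(self) -> None:
        score = self.score_function()
        if self.best_score is None or score > self.best_score:
            self.best_score = score
            self.best_state = self.state_function()
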
def training(local_rank, config, logger=None):
    # if not getattr(config, "use_fp16", True):
    #     raise RuntimeError("This training script uses fp16 AMP by default")

    torch.backends.cudnn.benchmark = True

    set_seed(config.seed + local_rank)

    train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader

    # Setup model, optimizer, criterion
    model, optimizer, criterion = initialize(config)

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger)

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator, train_evaluator = create_evaluators(model, val_metrics, config)

    val_interval = getattr(config, "val_interval", 1)

    @trainer.on(Events.EPOCH_COMPLETED(every=val_interval))
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(val_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    if config.num_epochs % val_interval != 0:
        trainer.add_event_handler(Events.COMPLETED, run_validation)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    # Store the 3 best models by validation mIoU_bg score:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models=model,
        metric_name=score_metric_name,
        n_saved=3,
        trainer=trainer,
        tag="val",
    )

    if idist.get_rank() == 0:
        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={"training": train_evaluator, "validation": evaluator},
        )
        exp_tracking_logger = tracking.setup_logging(
            trainer, optimizer, evaluators={"training": train_evaluator, "validation": evaluator}
        )

        # Log validation predictions as images
        # We define a custom event filter to log the images less frequently (to reduce storage size):
        #  - we plot images with masks of the middle validation batch
        #  - once every 3 validations and
        #  - at the end of the training
        def custom_event_filter(_, val_iteration):
            c1 = val_iteration == len(val_loader) // 2
            c2 = trainer.state.epoch % (getattr(config, "val_interval", 1) * 3) == 0
            c2 |= trainer.state.epoch == config.num_epochs
            return c1 and c2

        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation"
            ),
            event_name=Events.ITERATION_COMPLETED(event_filter=custom_event_filter),
        )

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if idist.get_rank() == 0:
        tb_logger.close()
        exp_tracking_logger.close()

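
# `create_evaluators` is called in the two segmentation variants above but
# not defined in this snippet. A minimal sketch using ignite's built-in
# `create_supervised_evaluator`; whether the original applies extra output
# transforms or AMP autocasting is unknown, so this is only an assumption.

import ignite.distributed as idist
from ignite.engine import create_supervised_evaluator

def create_evaluators(model, metrics, config):
    # Both evaluators share the same metrics; one runs on the validation set,
    # the other on a held-out slice of the training set for overfit checks.
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=idist.device(), non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=idist.device(), non_blocking=True)
    return evaluator, train_evaluator
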