def evaluate(model: AdapNet, dl: DataLoader, mode, batch_size=2):
    """
    Evaluates the model, using IoU as the metric.

    :param model: The model to evaluate
    :param dl: The DataLoader of the model
    :param mode: The evaluation mode, one of "test" or "validation"
    :param batch_size: The batch size for the evaluation
    :return: The per-class IoU scores as a tensor
    """
    model.eval()
    # Avoid shadowing the built-in `set`
    dataset = dl.test_set if mode == "test" else dl.validation_set
    reps = len(dataset) // batch_size

    cm = ConfusionMatrix(dl.num_labels)
    iou_cur = IoU(cm)

    with torch.no_grad():
        for _ in range(reps):
            m1, m2, gt = dl.sample_batch(batch_size, mode=mode)
            _, _, res = model(m1, m2)
            res = torch.softmax(res, dim=1)
            cm.update((res, gt))

    iou_score = iou_cur.compute()
    print("Evaluation of " + mode + " set")
    print("mIoU: " + str(iou_score.mean().item()))
    print("IoU: " + str(iou_score))
    return iou_score
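# --- Added note (not part of the snippet above) ---
# A minimal, self-contained sketch of the ConfusionMatrix/IoU interplay used in
# `evaluate`: the confusion matrix is updated with (scores, targets) pairs and
# IoU is derived from it lazily as a MetricsLambda. Shapes are illustrative.
import torch
from ignite.metrics import ConfusionMatrix, IoU

num_classes = 3
cm = ConfusionMatrix(num_classes=num_classes)
iou = IoU(cm)

scores = torch.rand(2, num_classes, 4, 4)                 # (batch, classes, H, W)
targets = torch.randint(0, num_classes, size=(2, 4, 4))   # (batch, H, W)

cm.update((scores, targets))
print(iou.compute())          # per-class IoU, shape (num_classes,)
print(iou.compute().mean())   # mIoU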
def create_iou_metric(self, cm: ConfusionMatrix):
    """
    Computes the Intersection over Union (Jaccard index,
    https://en.wikipedia.org/wiki/Jaccard_index).

    Args:
        cm (:obj:`ignite.metrics.ConfusionMatrix`): A confusion matrix representing
            the classification of data.

    Returns:
        array or float: The IoU for each class, or the mean IoU if the reduction is "mean".
    """
    metric = IoU(cm, ignore_index=self._ignore_index)
    if self._reduction == "mean":
        metric = metric.mean()
    return metric
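# --- Added note (not part of the snippet above) ---
# The original docstring referred to the Sørensen–Dice coefficient, which ignite
# exposes as a separate metric built on the same ConfusionMatrix. A Dice
# counterpart of the factory could plausibly look like this, assuming the same
# `_ignore_index` / `_reduction` attributes as the IoU version:
from ignite.metrics import ConfusionMatrix, DiceCoefficient


def create_dice_metric(self, cm: ConfusionMatrix):
    """Per-class (or mean) Sørensen–Dice coefficient derived from ``cm``."""
    metric = DiceCoefficient(cm, ignore_index=self._ignore_index)
    if self._reduction == "mean":
        metric = metric.mean()
    return metric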
def eval_model(model, val_loader, device='cpu', num_classes=21):
    def evaluate_function(engine, batch):
        model.eval()
        with torch.no_grad():
            img, mask = batch
            img = img.to(device)
            mask = mask.to(device)
            mask_pred = model(img)
            # torchvision segmentation models return a dict with the logits under 'out'
            if isinstance(mask_pred, dict):
                mask_pred = mask_pred['out']
            return mask_pred, mask

    val_evaluator = Engine(evaluate_function)

    cm = ConfusionMatrix(num_classes=num_classes)
    mIoU(cm).attach(val_evaluator, 'mean IoU')
    IoU(cm).attach(val_evaluator, 'IoU')
    Accuracy().attach(val_evaluator, "accuracy")
    Loss(loss_fn=nn.CrossEntropyLoss()).attach(val_evaluator, "CE Loss")

    state = val_evaluator.run(val_loader)
    # print("mIoU :", state.metrics['mean IoU'])
    # print("Accuracy :", state.metrics['accuracy'])
    # print("CE Loss :", state.metrics['CE Loss'])
    return state
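# --- Added usage sketch (not part of the snippet above) ---
# How the returned state would typically be consumed, assuming a segmentation
# `model` and `val_loader` are already defined; the keys mirror the metrics
# attached in `eval_model`.
state = eval_model(model, val_loader, device='cuda', num_classes=21)
print("mIoU     :", state.metrics['mean IoU'])
print("IoU      :", state.metrics['IoU'])
print("Accuracy :", state.metrics['accuracy'])
print("CE Loss  :", state.metrics['CE Loss'])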
def test_iou_wrong_input():
    with pytest.raises(TypeError, match="Argument cm should be instance of ConfusionMatrix"):
        IoU(None)

    cm = ConfusionMatrix(num_classes=10)
    with pytest.raises(ValueError, match="ignore_index should be non-negative integer"):
        IoU(cm, ignore_index=-1)

    with pytest.raises(ValueError, match="ignore_index should be non-negative integer"):
        IoU(cm, ignore_index="a")

    with pytest.raises(ValueError, match="ignore_index should be non-negative integer"):
        IoU(cm, ignore_index=10)

    with pytest.raises(ValueError, match="ignore_index should be non-negative integer"):
        IoU(cm, ignore_index=11)
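# --- Added reference case (not part of the original test) ---
# The valid construction that the error cases above contrast with:
# ignore_index must be an int in [0, num_classes).
def test_iou_valid_input():
    cm = ConfusionMatrix(num_classes=10)
    IoU(cm)                  # per-class IoU over all 10 classes
    IoU(cm, ignore_index=0)  # drop class 0 (e.g. background) from the result
    IoU(cm, ignore_index=9)  # largest admissible index for num_classes=10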
def evaluation(local_rank, config, logger, with_clearml):
    rank = idist.get_rank()
    device = idist.device()
    manual_seed(config.seed + local_rank)

    data_loader = config.data_loader
    model = config.model.to(device)

    # Load weights:
    state_dict = get_model_weights(config, logger, with_clearml)
    model.load_state_dict(state_dict)

    # Adapt model to dist config
    model = idist.auto_model(model)

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if ("val_metrics" in config) and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="val")

    # Setup Tensorboard logger
    if rank == 0:
        tb_logger = common.TensorboardLogger(log_dir=config.output_path.as_posix())
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.COMPLETED,
            tag="validation",
            metric_names="all",
        )

    # Log confusion matrix to ClearML:
    if with_clearml:
        evaluator.add_event_handler(Events.COMPLETED, compute_and_log_cm, cm_metric, evaluator.state.iteration)

    state = evaluator.run(data_loader)
    utils.log_metrics(logger, 0, state.times["COMPLETED"], "Validation", state.metrics)

    if idist.get_rank() == 0:
        tb_logger.close()
def run(args):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(colored("Using device: ", "white") + colored(device, "green"))

    print(colored("Initializing test dataset...", color="white"))
    _, _, test_dataset = get_datasets(args.data)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=True)

    model_factory = {
        'fcn-resnet50': lambda: torchvision.models.segmentation.fcn_resnet50(num_classes=NUM_CLASSES, pretrained=False),
        'fcn-resnet101': lambda: torchvision.models.segmentation.fcn_resnet101(num_classes=NUM_CLASSES, pretrained=False),
        'deeplab-resnet50': lambda: torchvision.models.segmentation.deeplabv3_resnet50(num_classes=NUM_CLASSES, pretrained=False),
        'deeplab-resnet101': lambda: torchvision.models.segmentation.deeplabv3_resnet101(num_classes=NUM_CLASSES, pretrained=False)
    }
    model = model_factory[args.model]()
    model.load_state_dict(torch.load(args.weights))
    model.to(device)

    cm_metric = ConfusionMatrix(num_classes=NUM_CLASSES, output_transform=output_transform_seg)
    metrics = {
        'dice': MetricsLambda(lambda x: torch.mean(x).item(), DiceCoefficient(cm_metric)),
        'iou': MetricsLambda(lambda x: torch.mean(x).item(), IoU(cm_metric)),
        'dice_background': MetricsLambda(lambda x: x[0].item(), DiceCoefficient(cm_metric)),
        'dice_head': MetricsLambda(lambda x: x[1].item(), DiceCoefficient(cm_metric)),
        'dice_mid': MetricsLambda(lambda x: x[2].item(), DiceCoefficient(cm_metric)),
        'dice_tail': MetricsLambda(lambda x: x[3].item(), DiceCoefficient(cm_metric)),
        'iou_background': MetricsLambda(lambda x: x[0].item(), IoU(cm_metric)),
        'iou_head': MetricsLambda(lambda x: x[1].item(), IoU(cm_metric)),
        'iou_mid': MetricsLambda(lambda x: x[2].item(), IoU(cm_metric)),
        'iou_tail': MetricsLambda(lambda x: x[3].item(), IoU(cm_metric))
    }

    print(colored("Evaluating...\n", color="white"))
    test_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device,
                                                 prepare_batch=prepare_batch)

    @test_evaluator.on(Events.COMPLETED)
    def log_test_metrics(engine):
        # Logs the evaluation metrics once the run is complete
        for k, v in engine.state.metrics.items():
            print(f"{k}: {v:.4f}")

    test_evaluator.run(test_loader)
def make_engine(process_function):
    evaluator = Engine(process_function)

    cm = ConfusionMatrix(
        num_classes=getattr(datasets, CONFIG["dataset"]["name"]).N_LABELS,
        output_transform=output_transform)
    IoU(cm, ignore_index=0).attach(evaluator, 'IoU')
    mIoU(cm, ignore_index=0).attach(evaluator, 'mIoU')
    Accuracy(output_transform=output_transform).attach(evaluator, 'Accuracy')
    cmAccuracy(cm, ignore_index=0).attach(evaluator, 'ClasswiseAccuracy')

    return evaluator
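# --- Added sketch (not part of the snippet above) ---
# `make_engine` relies on an `output_transform` defined elsewhere. A common form
# in segmentation pipelines (an assumption here, not taken from the original
# code) maps the process function's output to the (y_pred, y) pair that
# ConfusionMatrix and Accuracy expect:
def output_transform(output):
    # e.g. output == {"y_pred": logits of shape (B, C, H, W), "y": labels of shape (B, H, W)}
    return output["y_pred"], output["y"]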
def test_iou():
    def _test(average=None):
        y_true, y_pred = get_y_true_y_pred()
        th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred)

        true_res = [0, 0, 0]
        for index in range(3):
            bin_y_true = y_true == index
            bin_y_pred = y_pred == index
            intersection = bin_y_true & bin_y_pred
            union = bin_y_true | bin_y_pred
            true_res[index] = intersection.sum() / union.sum()

        cm = ConfusionMatrix(num_classes=3, average=average)
        iou_metric = IoU(cm)

        # Update metric
        output = (th_y_logits, th_y_true)
        cm.update(output)

        res = iou_metric.compute().numpy()
        assert np.all(res == true_res)

        for ignore_index in range(3):
            cm = ConfusionMatrix(num_classes=3)
            iou_metric = IoU(cm, ignore_index=ignore_index)
            # Update metric
            output = (th_y_logits, th_y_true)
            cm.update(output)
            res = iou_metric.compute().numpy()
            true_res_ = true_res[:ignore_index] + true_res[ignore_index + 1:]
            assert np.all(res == true_res_), "{}: {} vs {}".format(ignore_index, res, true_res_)

    _test()
    _test(average="samples")

    with pytest.raises(ValueError, match=r"ConfusionMatrix should have average attribute either"):
        cm = ConfusionMatrix(num_classes=3, average="precision")
        IoU(cm)
def training(config, local_rank, with_pbar_on_iters=True): if not getattr(config, "use_fp16", True): raise RuntimeError("This training script uses by default fp16 AMP") set_seed(config.seed + local_rank) torch.cuda.set_device(local_rank) device = 'cuda' torch.backends.cudnn.benchmark = True train_loader = config.train_loader train_sampler = getattr(train_loader, "sampler", None) assert train_sampler is not None, "Train loader of type '{}' " \ "should have attribute 'sampler'".format(type(train_loader)) assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \ "Train sampler should have a callable method `set_epoch`" train_eval_loader = config.train_eval_loader val_loader = config.val_loader model = config.model.to(device) optimizer = config.optimizer model, optimizer = amp.initialize(model, optimizer, opt_level=getattr( config, "fp16_opt_level", "O2"), num_losses=1) model = DDP(model, delay_allreduce=True) criterion = config.criterion.to(device) # Setup trainer prepare_batch = getattr(config, "prepare_batch") non_blocking = getattr(config, "non_blocking", True) accumulation_steps = getattr(config, "accumulation_steps", 1) model_output_transform = getattr(config, "model_output_transform", lambda x: x) def train_update_function(engine, batch): model.train() x, y = prepare_batch(batch, device=device, non_blocking=non_blocking) y_pred = model(x) y_pred = model_output_transform(y_pred) loss = criterion(y_pred, y) if isinstance(loss, Mapping): assert 'supervised batch loss' in loss loss_dict = loss output = {k: v.item() for k, v in loss_dict.items()} loss = loss_dict['supervised batch loss'] / accumulation_steps else: output = {'supervised batch loss': loss.item()} with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss: scaled_loss.backward() if engine.state.iteration % accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return output output_names = getattr(config, "output_names", [ 'supervised batch loss', ]) trainer = Engine(train_update_function) common.setup_common_distrib_training_handlers( trainer, train_sampler, to_save={ 'model': model, 'optimizer': optimizer }, save_every_iters=1000, output_path=config.output_path.as_posix(), lr_scheduler=config.lr_scheduler, output_names=output_names, with_pbars=True, with_pbar_on_iters=with_pbar_on_iters, log_every_iters=1) def output_transform(output): return output['y_pred'], output['y'] num_classes = config.num_classes cm_metric = ConfusionMatrix(num_classes=num_classes, output_transform=output_transform) pr = cmPrecision(cm_metric, average=False) re = cmRecall(cm_metric, average=False) val_metrics = { "IoU": IoU(cm_metric), "mIoU_bg": mIoU(cm_metric), "Accuracy": cmAccuracy(cm_metric), "Precision": pr, "Recall": re, "F1": Fbeta(beta=1.0, output_transform=output_transform) } if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict): val_metrics.update(config.val_metrics) evaluator_args = dict(model=model, metrics=val_metrics, device=device, non_blocking=non_blocking, prepare_batch=prepare_batch, output_transform=lambda x, y, y_pred: { 'y_pred': model_output_transform(y_pred), 'y': y }) train_evaluator = create_supervised_evaluator(**evaluator_args) evaluator = create_supervised_evaluator(**evaluator_args) if dist.get_rank() == 0 and with_pbar_on_iters: ProgressBar(persist=False, desc="Train Evaluation").attach(train_evaluator) ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator) def run_validation(engine): train_evaluator.run(train_eval_loader) evaluator.run(val_loader) 
if getattr(config, "start_by_validation", False): trainer.add_event_handler(Events.STARTED, run_validation) trainer.add_event_handler( Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)), run_validation) trainer.add_event_handler(Events.COMPLETED, run_validation) score_metric_name = "mIoU_bg" if hasattr(config, "es_patience"): common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name) if dist.get_rank() == 0: tb_logger = common.setup_tb_logging(config.output_path.as_posix(), trainer, optimizer, evaluators={ "training": train_evaluator, "validation": evaluator }) common.setup_mlflow_logging(trainer, optimizer, evaluators={ "training": train_evaluator, "validation": evaluator }) common.save_best_model_by_val_score(config.output_path.as_posix(), evaluator, model, metric_name=score_metric_name, trainer=trainer) # Log train/val predictions: tb_logger.attach( evaluator, log_handler=predictions_gt_images_handler( img_denormalize_fn=config.img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation"), event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2)) log_train_predictions = getattr(config, "log_train_predictions", False) if log_train_predictions: tb_logger.attach(train_evaluator, log_handler=predictions_gt_images_handler( img_denormalize_fn=config.img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation"), event_name=Events.ITERATION_COMPLETED( once=len(train_eval_loader) // 2)) trainer.run(train_loader, max_epochs=config.num_epochs)
def training(local_rank, config, logger, with_clearml): rank = idist.get_rank() manual_seed(config.seed + local_rank) train_loader = config.train_loader val_loader = config.val_loader train_eval_loader = config.train_eval_loader model, optimizer, criterion = utils.initialize(config) # Setup trainer for this specific task trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger, with_clearml) # Setup evaluators num_classes = config.num_classes cm_metric = ConfusionMatrix(num_classes=num_classes) val_metrics = { "IoU": IoU(cm_metric), "mIoU_bg": mIoU(cm_metric), } if ("val_metrics" in config) and isinstance(config.val_metrics, dict): val_metrics.update(config.val_metrics) evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="val") train_evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="train") val_interval = config.get("val_interval", 1) # Run validation on every val_interval epoch, in the end of the training # and in the begining if config.start_by_validation is True event = Events.EPOCH_COMPLETED(every=val_interval) if config.num_epochs % val_interval != 0: event |= Events.COMPLETED if config.get("start_by_validation", False): event |= Events.STARTED @trainer.on(event) def run_validation(): epoch = trainer.state.epoch state = train_evaluator.run(train_eval_loader) utils.log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics) state = evaluator.run(val_loader) utils.log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics) score_metric_name = "mIoU_bg" if "es_patience" in config: common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name) # Store 2 best models by validation accuracy: common.gen_save_best_models_by_val_score( save_handler=utils.get_save_handler(config.output_path.as_posix(), with_clearml), evaluator=evaluator, models=model, metric_name=score_metric_name, n_saved=2, trainer=trainer, tag="val", ) # Setup Tensorboard logger if rank == 0: tb_logger = common.setup_tb_logging( config.output_path.as_posix(), trainer, optimizer, evaluators={"training": train_evaluator, "validation": evaluator}, ) # Log validation predictions as images # We define a custom event filter to log less frequently the images (to reduce storage size) # - we plot images with masks of the middle validation batch # - once every 3 validations and # - at the end of the training def custom_event_filter(_, val_iteration): c1 = val_iteration == len(val_loader) // 2 c2 = trainer.state.epoch % (config.get("val_interval", 1) * 3) == 0 c2 |= trainer.state.epoch == config.num_epochs return c1 and c2 # Image denormalization function to plot predictions with images mean = config.get("mean", (0.485, 0.456, 0.406)) std = config.get("std", (0.229, 0.224, 0.225)) img_denormalize = partial(data.denormalize, mean=mean, std=std) tb_logger.attach( evaluator, log_handler=vis.predictions_gt_images_handler( img_denormalize_fn=img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation", ), event_name=Events.ITERATION_COMPLETED(event_filter=custom_event_filter), ) # Log confusion matrix to ClearML: if with_clearml: trainer.add_event_handler(Events.COMPLETED, compute_and_log_cm, cm_metric, trainer.state.iteration) trainer.run(train_loader, max_epochs=config.num_epochs) if idist.get_rank() == 0: tb_logger.close()
                                 [3, 2, 1, 2],
                                 [1, 1, 0, 0]])
        # (fragment) tail of the `pred` tensor literal inside the Data dataset's item getter
        mask = torch.LongTensor([[1, 1, 0, 1],
                                 [1, 2, 1, 0],
                                 [3, 1, 2, 2],
                                 [3, 1, 0, 0]])
        pred = make_one_hot(pred, 4)
        return pred[0], mask


model = nn.Sequential()
criterion = nn.BCELoss()
from torch.optim import Adam

device = torch.device('cpu')

cm = ConfusionMatrix(num_classes=4)
miou = mIoU(cm, ignore_index=0)
iou = IoU(cm, ignore_index=0)
metric = {'mIOU': miou, 'IOU': iou}

evaluator = create_supervised_evaluator(model, metric, device=device)

data_loader = DataLoader(Data())
evaluator.run(data_loader)
print(evaluator.state.output)

state = evaluator.run(data_loader)
print(state.metrics['mIOU'])
print(state.metrics['IOU'])
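# --- Added sketch (not part of the snippet above) ---
# `make_one_hot` is referenced above but not shown. A plausible implementation
# (an assumption, not the original helper) scatters integer labels into a
# one-hot tensor of shape (N, C, H, W), so that `pred[0]` has shape (C, H, W):
import torch


def make_one_hot(labels, num_classes):
    # labels: (H, W) or (N, H, W) LongTensor of class indices
    if labels.dim() == 2:
        labels = labels.unsqueeze(0)
    one_hot = torch.zeros(labels.size(0), num_classes, *labels.shape[1:])
    return one_hot.scatter_(1, labels.unsqueeze(1), 1.0)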
def run(args): train_loader, val_loader = get_data_loaders(args.dataset_dir, args.batch_size, args.val_batch_size, args.num_workers) if args.seed is not None: torch.manual_seed(args.seed) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') num_classes = KITTI.num_classes() model = LiLaNet(num_classes) device_count = torch.cuda.device_count() if device_count > 1: print("Using %d GPU(s)" % device_count) model = nn.DataParallel(model) args.batch_size = device_count * args.batch_size args.val_batch_size = device_count * args.val_batch_size model = model.to(device) criterion = nn.CrossEntropyLoss(weight=KITTI.class_weights()).to(device) optimizer = optim.Adam(model.parameters(), lr=args.lr) if args.resume: if os.path.isfile(args.resume): print("Loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) print("Loaded checkpoint '{}' (Epoch {})".format( args.resume, checkpoint['epoch'])) else: print("No checkpoint found at '{}'".format(args.resume)) def _prepare_batch(batch, non_blocking=True): distance, reflectivity, target = batch return (convert_tensor(distance, device=device, non_blocking=non_blocking), convert_tensor(reflectivity, device=device, non_blocking=non_blocking), convert_tensor(target, device=device, non_blocking=non_blocking)) def _update(engine, batch): model.train() optimizer.zero_grad() distance, reflectivity, target = _prepare_batch(batch) pred = model(distance, reflectivity) loss = criterion(pred, target) loss.backward() optimizer.step() return loss.item() trainer = Engine(_update) # attach running average metrics RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') # attach progress bar pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=['loss']) def _inference(engine, batch): model.eval() with torch.no_grad(): distance, reflectivity, target = _prepare_batch(batch) pred = model(distance, reflectivity) return pred, target evaluator = Engine(_inference) cm = ConfusionMatrix(num_classes) IoU(cm, ignore_index=0).attach(evaluator, 'IoU') Loss(criterion).attach(evaluator, 'loss') pbar2 = ProgressBar(persist=True, desc='Eval Epoch') pbar2.attach(evaluator) def _global_step_transform(engine, event_name): if trainer.state is not None: return trainer.state.iteration else: return 1 tb_logger = TensorboardLogger(args.log_dir) tb_logger.attach(trainer, log_handler=OutputHandler(tag='training', metric_names=['loss']), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(evaluator, log_handler=OutputHandler( tag='validation', metric_names=['loss', 'IoU'], global_step_transform=_global_step_transform), event_name=Events.EPOCH_COMPLETED) @trainer.on(Events.STARTED) def initialize(engine): engine.state.exception_raised = False if args.resume: engine.state.epoch = args.start_epoch @evaluator.on(Events.EPOCH_COMPLETED) def save_checkpoint(engine): epoch = trainer.state.epoch if trainer.state is not None else 1 iou = engine.state.metrics['IoU'] * 100.0 mean_iou = iou.mean() name = 'epoch{}_mIoU={:.1f}.pth'.format(epoch, mean_iou) file = { 'model': model.state_dict(), 'epoch': epoch, 'optimizer': optimizer.state_dict(), 'args': args } save(file, args.output_dir, 'checkpoint_{}'.format(name)) save(model.state_dict(), args.output_dir, 'model_{}'.format(name)) @trainer.on(Events.EPOCH_COMPLETED) def run_validation(engine): pbar.log_message("Start Validation - Epoch: [{}/{}]".format( 
engine.state.epoch, engine.state.max_epochs)) evaluator.run(val_loader) metrics = evaluator.state.metrics loss = metrics['loss'] iou = metrics['IoU'] * 100.0 mean_iou = iou.mean() iou_text = ', '.join([ '{}: {:.1f}'.format(KITTI.classes[i + 1].name, v) for i, v in enumerate(iou.tolist()) ]) pbar.log_message( "Validation results - Epoch: [{}/{}]: Loss: {:.2e}\n IoU: {}\n mIoU: {:.1f}" .format(engine.state.epoch, engine.state.max_epochs, loss, iou_text, mean_iou)) @trainer.on(Events.EXCEPTION_RAISED) def handle_exception(engine, e): engine.state.exception_raised = True if isinstance(e, KeyboardInterrupt) and (engine.state.iteration > 1): engine.terminate() warnings.warn("KeyboardInterrupt caught. Exiting gracefully.") name = 'epoch{}_exception.pth'.format(trainer.state.epoch) file = { 'model': model.state_dict(), 'epoch': trainer.state.epoch, 'optimizer': optimizer.state_dict() } save(file, args.output_dir, 'checkpoint_{}'.format(name)) save(model.state_dict(), args.output_dir, 'model_{}'.format(name)) else: raise e if args.eval_on_start: print("Start validation") evaluator.run(val_loader, max_epochs=1) print("Start training") trainer.run(train_loader, max_epochs=args.epochs) tb_logger.close()
def inference(config, local_rank, with_pbar_on_iters=True): set_seed(config.seed + local_rank) torch.cuda.set_device(local_rank) device = 'cuda' torch.backends.cudnn.benchmark = True # Load model and weights model_weights_filepath = Path( get_artifact_path(config.run_uuid, config.weights_filename)) assert model_weights_filepath.exists(), \ "Model weights file '{}' is not found".format(model_weights_filepath.as_posix()) model = config.model.to(device) model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank) if hasattr(config, "custom_weights_loading"): config.custom_weights_loading(model, model_weights_filepath) else: state_dict = torch.load(model_weights_filepath) if not all([k.startswith("module.") for k in state_dict]): state_dict = {f"module.{k}": v for k, v in state_dict.items()} model.load_state_dict(state_dict) model.eval() prepare_batch = config.prepare_batch non_blocking = getattr(config, "non_blocking", True) model_output_transform = getattr(config, "model_output_transform", lambda x: x) tta_transforms = getattr(config, "tta_transforms", None) def eval_update_function(engine, batch): with torch.no_grad(): x, y, meta = prepare_batch(batch, device=device, non_blocking=non_blocking) if tta_transforms is not None: y_preds = [] for t in tta_transforms: t_x = t.augment_image(x) t_y_pred = model(t_x) t_y_pred = model_output_transform(t_y_pred) y_pred = t.deaugment_mask(t_y_pred) y_preds.append(y_pred) y_preds = torch.stack(y_preds, dim=0) y_pred = torch.mean(y_preds, dim=0) else: y_pred = model(x) y_pred = model_output_transform(y_pred) return {"y_pred": y_pred, "y": y, "meta": meta} evaluator = Engine(eval_update_function) has_targets = getattr(config, "has_targets", False) if has_targets: def output_transform(output): return output['y_pred'], output['y'] num_classes = config.num_classes cm_metric = ConfusionMatrix(num_classes=num_classes, output_transform=output_transform) pr = cmPrecision(cm_metric, average=False) re = cmRecall(cm_metric, average=False) val_metrics = { "IoU": IoU(cm_metric), "mIoU_bg": mIoU(cm_metric), "Accuracy": cmAccuracy(cm_metric), "Precision": pr, "Recall": re, "F1": Fbeta(beta=1.0, output_transform=output_transform) } if hasattr(config, "metrics") and isinstance(config.metrics, dict): val_metrics.update(config.metrics) for name, metric in val_metrics.items(): metric.attach(evaluator, name) if dist.get_rank() == 0: # Log val metrics: mlflow_logger = MLflowLogger() mlflow_logger.attach(evaluator, log_handler=OutputHandler( tag="validation", metric_names=list(val_metrics.keys())), event_name=Events.EPOCH_COMPLETED) if dist.get_rank() == 0 and with_pbar_on_iters: ProgressBar(persist=True, desc="Inference").attach(evaluator) if dist.get_rank() == 0: do_save_raw_predictions = getattr(config, "do_save_raw_predictions", True) do_save_overlayed_predictions = getattr( config, "do_save_overlayed_predictions", True) if not has_targets: assert do_save_raw_predictions or do_save_overlayed_predictions, \ "If no targets, either do_save_overlayed_predictions or do_save_raw_predictions should be " \ "defined in the config and has value equal True" # Save predictions if do_save_raw_predictions: raw_preds_path = config.output_path / "raw" raw_preds_path.mkdir(parents=True) evaluator.add_event_handler(Events.ITERATION_COMPLETED, save_raw_predictions_with_geoinfo, raw_preds_path) if do_save_overlayed_predictions: overlayed_preds_path = config.output_path / "overlay" overlayed_preds_path.mkdir(parents=True) 
evaluator.add_event_handler( Events.ITERATION_COMPLETED, save_overlayed_predictions, overlayed_preds_path, img_denormalize_fn=config.img_denormalize, palette=default_palette) evaluator.add_event_handler(Events.EXCEPTION_RAISED, report_exception) # Run evaluation evaluator.run(config.data_loader)
def run(args): train_loader, val_loader = get_data_loaders(args.dataset_dir, args.batch_size, args.val_batch_size, args.num_workers) if args.seed is not None: torch.manual_seed(args.seed) num_classes = CityscapesDataset.num_classes() device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') model = GoogLeNetFCN(num_classes) model.init_from_googlenet() device_count = torch.cuda.device_count() if device_count > 1: print("Using %d GPU(s)" % device_count) model = nn.DataParallel(model) args.batch_size = device_count * args.batch_size args.val_batch_size = device_count * args.val_batch_size model = model.to(device) criterion = nn.CrossEntropyLoss(ignore_index=255) optimizer = optim.SGD([{'params': [p for p, name in model.named_parameters() if name[-4:] != 'bias'], 'lr': args.lr, 'weight_decay': 5e-4}, {'params': [p for p, name in model.named_parameters() if name[-4:] == 'bias'], 'lr': args.lr * 2}], momentum=args.momentum, lr=args.lr) if args.resume: if os.path.isfile(args.resume): print("Loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) print("Loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch'])) else: print("No checkpoint found at '{}'".format(args.resume)) trainer = create_supervised_trainer(model, optimizer, criterion, device, non_blocking=True) RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') # attach progress bar pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=['loss']) cm = ConfusionMatrix(num_classes) evaluator = create_supervised_evaluator(model, metrics={'loss': Loss(criterion), 'IoU': IoU(cm, ignore_index=0)}, device=device, non_blocking=True) pbar2 = ProgressBar(persist=True, desc='Eval Epoch') pbar2.attach(evaluator) def _global_step_transform(engine, event_name): return trainer.state.iteration tb_logger = TensorboardLogger(args.log_dir) tb_logger.attach(trainer, log_handler=OutputHandler(tag='training', metric_names=['loss']), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag='validation', metric_names=['loss', 'IoU'], global_step_transform=_global_step_transform), event_name=Events.EPOCH_COMPLETED) @evaluator.on(Events.EPOCH_COMPLETED) def save_checkpoint(engine): iou = engine.state.metrics['IoU'] * 100.0 mean_iou = iou.mean() name = 'epoch{}_mIoU={:.1f}.pth'.format(trainer.state.epoch, mean_iou) file = {'model': model.state_dict(), 'epoch': trainer.state.epoch, 'optimizer': optimizer.state_dict(), 'args': args} torch.save(file, os.path.join(args.output_dir, 'checkpoint_{}'.format(name))) torch.save(model.state_dict(), os.path.join(args.output_dir, 'model_{}'.format(name))) @trainer.on(Events.STARTED) def initialize(engine): if args.resume: engine.state.epoch = args.start_epoch @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): pbar.log_message('Start Validation - Epoch: [{}/{}]'.format(engine.state.epoch, engine.state.max_epochs)) evaluator.run(val_loader) metrics = evaluator.state.metrics loss = metrics['loss'] iou = metrics['IoU'] mean_iou = iou.mean() pbar.log_message('Validation results - Epoch: [{}/{}]: Loss: {:.2e}, mIoU: {:.1f}' .format(engine.state.epoch, engine.state.max_epochs, loss, mean_iou * 100.0)) @trainer.on(Events.EXCEPTION_RAISED) def handle_exception(engine, e): engine.state.exception_raised = True if isinstance(e, KeyboardInterrupt) and 
(engine.state.iteration > 1): engine.terminate() warnings.warn("KeyboardInterrupt caught. Exiting gracefully.") name = 'epoch{}_exception.pth'.format(trainer.state.epoch) file = {'model': model.state_dict(), 'epoch': trainer.state.epoch, 'optimizer': optimizer.state_dict()} torch.save(file, os.path.join(args.output_dir, 'checkpoint_{}'.format(name))) torch.save(model.state_dict(), os.path.join(args.output_dir, 'model_{}'.format(name))) else: raise e print("Start training") trainer.run(train_loader, max_epochs=args.epochs) tb_logger.close()
def training(config, local_rank, with_pbar_on_iters=True): if not getattr(config, "use_fp16", True): raise RuntimeError("This training script uses by default fp16 AMP") set_seed(config.seed + local_rank) torch.cuda.set_device(local_rank) device = 'cuda' torch.backends.cudnn.benchmark = True train_loader = config.train_loader train_sampler = getattr(train_loader, "sampler", None) assert train_sampler is not None, "Train loader of type '{}' " \ "should have attribute 'sampler'".format(type(train_loader)) assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \ "Train sampler should have a callable method `set_epoch`" unsup_train_loader = config.unsup_train_loader unsup_train_sampler = getattr(unsup_train_loader, "sampler", None) assert unsup_train_sampler is not None, "Train loader of type '{}' " \ "should have attribute 'sampler'".format(type(unsup_train_loader)) assert hasattr(unsup_train_sampler, 'set_epoch') and callable(unsup_train_sampler.set_epoch), \ "Unsupervised train sampler should have a callable method `set_epoch`" train_eval_loader = config.train_eval_loader val_loader = config.val_loader model = config.model.to(device) optimizer = config.optimizer model, optimizer = amp.initialize(model, optimizer, opt_level=getattr(config, "fp16_opt_level", "O2"), num_losses=2) model = DDP(model, delay_allreduce=True) criterion = config.criterion.to(device) unsup_criterion = config.unsup_criterion.to(device) unsup_batch_num_repetitions = getattr(config, "unsup_batch_num_repetitions", 1) # Setup trainer prepare_batch = getattr(config, "prepare_batch") non_blocking = getattr(config, "non_blocking", True) accumulation_steps = getattr(config, "accumulation_steps", 1) model_output_transform = getattr(config, "model_output_transform", lambda x: x) def cycle(seq): while True: for i in seq: yield i unsup_train_loader_iter = cycle(unsup_train_loader) def supervised_loss(batch): x, y = prepare_batch(batch, device=device, non_blocking=non_blocking) y_pred = model(x) y_pred = model_output_transform(y_pred) loss = criterion(y_pred, y) return loss def unsupervised_loss(x): with torch.no_grad(): y_pred_orig = model(x) # Data augmentation: geom only k = random.randint(1, 3) x_aug = torch.rot90(x, k=k, dims=(2, 3)) y_pred_orig_aug = torch.rot90(y_pred_orig, k=k, dims=(2, 3)) if random.random() < 0.5: x_aug = torch.flip(x_aug, dims=(2, )) y_pred_orig_aug = torch.flip(y_pred_orig_aug, dims=(2, )) if random.random() < 0.5: x_aug = torch.flip(x_aug, dims=(3, )) y_pred_orig_aug = torch.flip(y_pred_orig_aug, dims=(3, )) y_pred_orig_aug = y_pred_orig_aug.argmax(dim=1).long() y_pred_aug = model(x_aug.detach()) loss = unsup_criterion(y_pred_aug, y_pred_orig_aug.detach()) return loss def train_update_function(engine, batch): model.train() loss = supervised_loss(batch) if isinstance(loss, Mapping): assert 'supervised batch loss' in loss loss_dict = loss output = {k: v.item() for k, v in loss_dict.items()} loss = loss_dict['supervised batch loss'] / accumulation_steps else: output = {'supervised batch loss': loss.item()} # Difference with original UDA # Apply separately grads from supervised/unsupervised parts with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss: scaled_loss.backward() if engine.state.iteration % accumulation_steps == 0: optimizer.step() optimizer.zero_grad() unsup_batch = next(unsup_train_loader_iter) unsup_x = unsup_batch['image'] unsup_x = convert_tensor(unsup_x, device=device, non_blocking=non_blocking) for _ in range(unsup_batch_num_repetitions): unsup_loss = 
engine.state.unsup_lambda * unsupervised_loss(unsup_x) assert isinstance(unsup_loss, torch.Tensor) output['unsupervised batch loss'] = unsup_loss.item() with amp.scale_loss(unsup_loss, optimizer, loss_id=1) as scaled_loss: scaled_loss.backward() if engine.state.iteration % accumulation_steps == 0: optimizer.step() optimizer.zero_grad() unsup_batch = None unsup_x = None total_loss = loss + unsup_loss output['total batch loss'] = total_loss.item() return output output_names = getattr(config, "output_names", ['supervised batch loss', 'unsupervised batch loss', 'total batch loss']) trainer = Engine(train_update_function) @trainer.on(Events.STARTED) def init(engine): if hasattr(config, "unsup_lambda_min"): engine.state.unsup_lambda = config.unsup_lambda_min else: engine.state.unsup_lambda = getattr(config, "unsup_lambda", 0.001) @trainer.on(Events.ITERATION_COMPLETED) def update_unsup_params(engine): engine.state.unsup_lambda += getattr(config, "unsup_lambda_delta", 0.00001) if hasattr(config, "unsup_lambda_max"): m = config.unsup_lambda_max engine.state.unsup_lambda = engine.state.unsup_lambda if engine.state.unsup_lambda < m else m common.setup_common_distrib_training_handlers( trainer, train_sampler, to_save={'model': model, 'optimizer': optimizer}, save_every_iters=1000, output_path=config.output_path.as_posix(), lr_scheduler=config.lr_scheduler, output_names=output_names, with_pbars=True, with_pbar_on_iters=with_pbar_on_iters, log_every_iters=1 ) def output_transform(output): return output['y_pred'], output['y'] num_classes = config.num_classes cm_metric = ConfusionMatrix(num_classes=num_classes, output_transform=output_transform) pr = cmPrecision(cm_metric, average=False) re = cmRecall(cm_metric, average=False) val_metrics = { "IoU": IoU(cm_metric), "mIoU_bg": mIoU(cm_metric), "Accuracy": cmAccuracy(cm_metric), "Precision": pr, "Recall": re, "F1": Fbeta(beta=1.0, output_transform=output_transform) } if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict): val_metrics.update(config.val_metrics) evaluator_args = dict( model=model, metrics=val_metrics, device=device, non_blocking=non_blocking, prepare_batch=prepare_batch, output_transform=lambda x, y, y_pred: {'y_pred': model_output_transform(y_pred), 'y': y} ) train_evaluator = create_supervised_evaluator(**evaluator_args) evaluator = create_supervised_evaluator(**evaluator_args) if dist.get_rank() == 0 and with_pbar_on_iters: ProgressBar(persist=False, desc="Train Evaluation").attach(train_evaluator) ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator) def run_validation(engine): train_evaluator.run(train_eval_loader) evaluator.run(val_loader) if getattr(config, "start_by_validation", False): trainer.add_event_handler(Events.STARTED, run_validation) trainer.add_event_handler(Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)), run_validation) trainer.add_event_handler(Events.COMPLETED, run_validation) score_metric_name = "mIoU_bg" if hasattr(config, "es_patience"): common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name) if dist.get_rank() == 0: tb_logger = common.setup_tb_logging(config.output_path.as_posix(), trainer, optimizer, evaluators={"training": train_evaluator, "validation": evaluator}) common.setup_mlflow_logging(trainer, optimizer, evaluators={"training": train_evaluator, "validation": evaluator}) common.save_best_model_by_val_score(config.output_path.as_posix(), evaluator, model, metric_name=score_metric_name, trainer=trainer) # 
Log unsup_lambda @trainer.on(Events.ITERATION_COMPLETED(every=100)) def tblog_unsupervised_lambda(engine): tb_logger.writer.add_scalar("training/unsupervised lambda", engine.state.unsup_lambda, engine.state.iteration) mlflow.log_metric("training unsupervised lambda", engine.state.unsup_lambda, step=engine.state.iteration) # Log train/val predictions: tb_logger.attach(evaluator, log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation"), event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2)) log_train_predictions = getattr(config, "log_train_predictions", False) if log_train_predictions: tb_logger.attach(train_evaluator, log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation"), event_name=Events.ITERATION_COMPLETED(once=len(train_eval_loader) // 2)) trainer.run(train_loader, max_epochs=config.num_epochs)
def training(local_rank, config, logger=None): # # if not getattr(config, "use_fp16", True): # raise RuntimeError("This training script uses by default fp16 AMP") torch.backends.cudnn.benchmark = True set_seed(config.seed + local_rank) train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader # Setup model, optimizer, criterion model, optimizer, criterion = initialize(config) # Setup trainer for this specific task trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger) # Setup evaluators num_classes = config.num_classes cm_metric = ConfusionMatrix(num_classes=num_classes) val_metrics = { "IoU": IoU(cm_metric), "mIoU_bg": mIoU(cm_metric), } if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict): val_metrics.update(config.val_metrics) evaluator, train_evaluator = create_evaluators(model, val_metrics, config) val_interval = getattr(config, "val_interval", 1) @trainer.on(Events.EPOCH_COMPLETED(every=val_interval)) def run_validation(): epoch = trainer.state.epoch state = train_evaluator.run(train_eval_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics) state = evaluator.run(val_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics) if config.num_epochs % val_interval != 0: trainer.add_event_handler(Events.COMPLETED, run_validation) if getattr(config, "start_by_validation", False): trainer.add_event_handler(Events.STARTED, run_validation) score_metric_name = "mIoU_bg" if hasattr(config, "es_patience"): common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name) # Store 3 best models by validation accuracy: common.gen_save_best_models_by_val_score( save_handler=get_save_handler(config), evaluator=evaluator, models=model, metric_name=score_metric_name, n_saved=3, trainer=trainer, tag="val", ) if idist.get_rank() == 0: tb_logger = common.setup_tb_logging( config.output_path.as_posix(), trainer, optimizer, evaluators={ "training": train_evaluator, "validation": evaluator }, ) exp_tracking_logger = tracking.setup_logging(trainer, optimizer, evaluators={ "training": train_evaluator, "validation": evaluator }) # Log validation predictions as images # We define a custom event filter to log less frequently the images (to reduce storage size) # - we plot images with masks of the middle validation batch # - once every 3 validations and # - at the end of the training def custom_event_filter(_, val_iteration): c1 = val_iteration == len(val_loader) // 2 c2 = trainer.state.epoch % (getattr(config, "val_interval", 1) * 3) == 0 c2 |= trainer.state.epoch == config.num_epochs return c1 and c2 tb_logger.attach( evaluator, log_handler=predictions_gt_images_handler( img_denormalize_fn=config.img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation"), event_name=Events.ITERATION_COMPLETED( event_filter=custom_event_filter), ) # Log confusion matrix to Trains: trainer.run(train_loader, max_epochs=config.num_epochs) if idist.get_rank() == 0: tb_logger.close() exp_tracking_logger.close()
def training(local_rank, config, logger=None): if not getattr(config, "use_fp16", True): raise RuntimeError("This training script uses by default fp16 AMP") torch.backends.cudnn.benchmark = True set_seed(config.seed + local_rank) device = config.device train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader # Setup model, optimizer, criterion model, optimizer, criterion = initialize(config) # Setup trainer for this specific task trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger) # Setup evaluators num_classes = config.num_classes cm_metric = ConfusionMatrix(num_classes=num_classes) val_metrics = { "IoU": IoU(cm_metric), "mIoU_bg": mIoU(cm_metric), } if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict): val_metrics.update(config.val_metrics) evaluator, train_evaluator = create_evaluators(model, val_metrics, config) @trainer.on( Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)) | Events.COMPLETED) def run_validation(): epoch = trainer.state.epoch state = train_evaluator.run(train_eval_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics) state = evaluator.run(val_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics) if getattr(config, "start_by_validation", False): trainer.add_event_handler(Events.STARTED, run_validation) score_metric_name = "mIoU_bg" if hasattr(config, "es_patience"): common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name) # Store 3 best models by validation accuracy: common.save_best_model_by_val_score( config.output_path.as_posix(), evaluator, model=model, metric_name=score_metric_name, n_saved=3, trainer=trainer, tag="val", ) if idist.get_rank() == 0: tb_logger = common.setup_tb_logging( config.output_path.as_posix(), trainer, optimizer, evaluators={ "training": train_evaluator, "validation": evaluator }, ) exp_tracking_logger = exp_tracking.setup_logging(trainer, optimizer, evaluators={ "training": train_evaluator, "validation": evaluator }) # Log val predictions: tb_logger.attach( evaluator, log_handler=predictions_gt_images_handler( img_denormalize_fn=config.img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation"), event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2), ) trainer.run(train_loader, max_epochs=config.num_epochs) if idist.get_rank() == 0: tb_logger.close() exp_tracking_logger.close()
def training(local_rank, config, logger=None): if not getattr(config, "use_fp16", True): raise RuntimeError("This training script uses by default fp16 AMP") torch.backends.cudnn.benchmark = True set_seed(config.seed + local_rank) train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader # Setup model, optimizer, criterion model, optimizer, criterion = initialize(config) # Setup trainer for this specific task trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger) # Setup evaluators num_classes = config.num_classes cm_metric = ConfusionMatrix(num_classes=num_classes) val_metrics = { "IoU": IoU(cm_metric), "mIoU_bg": mIoU(cm_metric), } if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict): val_metrics.update(config.val_metrics) evaluator, train_evaluator = create_evaluators(model, val_metrics, config) val_interval = getattr(config, "val_interval", 1) @trainer.on(Events.EPOCH_COMPLETED(every=val_interval)) def run_validation(): epoch = trainer.state.epoch state = train_evaluator.run(train_eval_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics) state = evaluator.run(val_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics) if config.num_epochs % val_interval != 0: trainer.add_event_handler(Events.COMPLETED, run_validation) if getattr(config, "start_by_validation", False): trainer.add_event_handler(Events.STARTED, run_validation) score_metric_name = "mIoU_bg" if hasattr(config, "es_patience"): common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name) # Store 3 best models by validation accuracy: common.gen_save_best_models_by_val_score( save_handler=get_save_handler(config), evaluator=evaluator, models=model, metric_name=score_metric_name, n_saved=3, trainer=trainer, tag="val", ) if idist.get_rank() == 0: tb_logger = common.setup_tb_logging( config.output_path.as_posix(), trainer, optimizer, evaluators={ "training": train_evaluator, "validation": evaluator }, ) if not exp_tracking.has_clearml: exp_tracking_logger = exp_tracking.setup_logging( trainer, optimizer, evaluators={ "training": train_evaluator, "validation": evaluator }) # Log validation predictions as images # We define a custom event filter to log less frequently the images (to reduce storage size) # - we plot images with masks of the middle validation batch # - once every 3 validations and # - at the end of the training def custom_event_filter(_, val_iteration): c1 = val_iteration == len(val_loader) // 2 c2 = trainer.state.epoch % (getattr(config, "val_interval", 1) * 3) == 0 c2 |= trainer.state.epoch == config.num_epochs return c1 and c2 tb_logger.attach( evaluator, log_handler=predictions_gt_images_handler( img_denormalize_fn=config.img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation"), event_name=Events.ITERATION_COMPLETED( event_filter=custom_event_filter), ) # Log confusion matrix to ClearML: if exp_tracking.has_clearml: @trainer.on(Events.COMPLETED) def compute_and_log_cm(): cm = cm_metric.compute() # CM: values are normalized such that diagonal values represent class recalls cm = ConfusionMatrix.normalize(cm, "recall").cpu().numpy() if idist.get_rank() == 0: try: from clearml import Task except ImportError: # Backwards-compatibility for legacy Trains SDK from trains import Task clearml_logger = Task.current_task().get_logger() clearml_logger.report_confusion_matrix( title="Final 
Confusion Matrix", series="cm-preds-gt", matrix=cm, iteration=trainer.state.iteration, xlabels=VOCSegmentationOpencv.target_names, ylabels=VOCSegmentationOpencv.target_names, ) trainer.run(train_loader, max_epochs=config.num_epochs) if idist.get_rank() == 0: tb_logger.close() if not exp_tracking.has_clearml: exp_tracking_logger.close()
def run(train_config, logger, **kwargs): logger = logging.getLogger('UDA') if getattr(train_config, 'debug', False): setup_logger(logger, logging.DEBUG) # Set Polyaxon environment if needed plx_logger = None save_dir = None output_experiment_path = None try: plx_logger = PolyaxonLogger() experiment = plx_logger.experiment save_dir = get_outputs_path() output_experiment_path = get_outputs_refs_paths() output_experiment_path = output_experiment_path['experiments'][ 0] if output_experiment_path else None logger.debug("Experiment info: {}".format( experiment.get_experiment_info())) except PolyaxonClientException as e: logger.warning('Logger Polyaxon : ' + str(e)) # Path configuration saves_dict = getattr(train_config, 'saves', {}) save_dir = saves_dict.get('save_dir', '') if save_dir is None else save_dir log_dir = os.path.join(save_dir, saves_dict.get('log_dir', '')) save_model_dir = os.path.join(save_dir, saves_dict.get('model_dir', '')) save_prediction_dir = os.path.join(save_dir, saves_dict.get('prediction_dir', '')) save_config_dir = os.path.join(save_dir, saves_dict.get('config_dir', '')) load_model_file = saves_dict.get('load_model_file', '') load_optimizer_file = saves_dict.get('load_optimizer_file', '') # Create folders create_save_folders(save_dir, saves_dict) if output_experiment_path is not None: model_dir = saves_dict.get('model_dir', '') load_model_file = os.path.join( output_experiment_path, model_dir, load_model_file) if load_model_file else None load_optimizer_file = os.path.join( output_experiment_path, model_dir, load_optimizer_file) if load_optimizer_file else None num_epochs = getattr(train_config, 'num_epochs') num_classes = getattr(train_config, 'num_classes') device = getattr(train_config, 'device', 'cpu') # Set magical acceleration if torch.cuda.is_available(): torch.backends.cudnn.benchmark = True else: assert device == 'cpu', 'CUDA device selected but none is available' # Set half precision if required use_fp_16 = getattr(train_config, 'use_fp_16', False) train1_sup_loader = getattr(train_config, 'train1_sup_loader') train1_unsup_loader = getattr(train_config, 'train1_unsup_loader') train2_unsup_loader = getattr(train_config, 'train2_unsup_loader') test_loader = getattr(train_config, 'test_loader') save_interval = saves_dict.get('save_interval', 0) n_saved = saves_dict.get('n_saved', 0) val_interval = getattr(train_config, 'val_interval', 1) pred_interval = getattr(train_config, 'pred_interval', 0) model = getattr(train_config, 'model').to(device) optimizer = getattr(train_config, 'optimizer') criterion = getattr(train_config, 'criterion').to(device) consistency_criterion = getattr(train_config, 'consistency_criterion').to(device) cm_metric = getattr( train_config, 'cm_metric', ConfusionMatrix(num_classes=num_classes, output_transform=lambda x: (x['y_pred'], x['y']))) # AMP initialization for half precision if use_fp_16: assert 'cuda' in device assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled." try: from apex import amp except: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to run this example." 
) # Initialize amp model, optimizer = amp.initialize(model, optimizer, opt_level="O2") # Load checkpoint load_params(model, optimizer=optimizer, model_file=load_model_file, optimizer_file=load_optimizer_file, device_name=device) # Add batch norm is_bn = getattr(train_config, 'is_bn', False) if is_bn: batch_norm = nn.BatchNorm2d(3).to(device) if use_fp_16: batch_norm = amp.initialize(batch_norm) batch_norm.reset_parameters() model = nn.Sequential(batch_norm, model) # Copy the config file shutil.copy2(os.path.abspath(train_config.__file__), os.path.join(save_config_dir, 'checkpoint_module.py')) le = len(train1_sup_loader) num_train_steps = le * num_epochs mlflow.log_param("num train steps", num_train_steps) lr = getattr(train_config, 'learning_rate') num_warmup_steps = getattr(train_config, 'num_warmup_steps', 0) lr_scheduler = getattr(train_config, 'lr_scheduler', None) if lr_scheduler is not None: lr_scheduler = lr_scheduler(optimizer) if num_warmup_steps > 0: lr_scheduler = create_lr_scheduler_with_warmup( lr_scheduler, warmup_start_value=0.0, warmup_end_value=lr * (1.0 + 1.0 / num_warmup_steps), warmup_duration=num_warmup_steps) train1_sup_loader_iter = cycle(train1_sup_loader) train1_unsup_loader_iter = cycle(train1_unsup_loader) train2_unsup_loader_iter = cycle(train2_unsup_loader) # Reduce on plateau reduce_on_plateau = getattr(train_config, 'reduce_on_plateau', None) # Output transform model output_transform_model = getattr(train_config, 'output_transform_model', lambda x: x) inference_fn = getattr(train_config, 'inference_fn', inference_standard) lam = getattr(train_config, 'consistency_lambda') beta = getattr(train_config, 'consistency_beta', lam) tsa = TrainingSignalAnnealing( num_steps=num_train_steps, min_threshold=getattr(train_config, 'TSA_proba_min'), max_threshold=getattr(train_config, 'TSA_proba_max')) with_tsa = getattr(train_config, 'with_TSA', False) cfg = { 'tsa': tsa, 'lambda': lam, 'beta': beta, 'with_tsa': with_tsa, 'device': device, 'consistency_criterion': consistency_criterion, 'criterion': criterion } trainer = Engine( partial(train_update_function, model=model, optimizer=optimizer, cfg=cfg, train1_sup_loader_iter=train1_sup_loader_iter, train1_unsup_loader_iter=train1_unsup_loader_iter, train2_unsup_loader_iter=train2_unsup_loader_iter, output_transform_model=output_transform_model, use_fp_16=use_fp_16)) # Register events for e in CustomEvents: State.event_to_attr[e] = 'iteration' trainer.register_events(*CustomEvents) if with_tsa: trainer.add_event_handler(Events.ITERATION_COMPLETED, log_tsa, tsa) if lr_scheduler is not None: if not hasattr(lr_scheduler, "step"): trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler) else: trainer.add_event_handler(Events.ITERATION_STARTED, lambda engine: lr_scheduler.step()) trainer.add_event_handler(Events.ITERATION_COMPLETED, log_learning_rate, optimizer) metric_names = [ 'supervised batch loss', 'consistency batch loss', 'final batch loss' ] def output_transform(x, name): return x[name] for n in metric_names: RunningAverage( output_transform=partial(output_transform, name=n)).attach( trainer, n) ProgressBar(persist=True, bar_format="").attach(trainer, event_name=Events.EPOCH_STARTED, closing_event_name=Events.COMPLETED) # Handlers for Tensorboard logging tb_logger = TensorboardLogger(log_dir=log_dir) tb_logger.attach(trainer, log_handler=tbOutputHandler(tag="train", metric_names=metric_names), event_name=CustomEvents.ITERATION_K_COMPLETED) tb_logger.attach(trainer, log_handler=tbOptimizerParamsHandler(optimizer, 
param_name="lr"), event_name=CustomEvents.ITERATION_K_STARTED) # Handlers for Polyaxon logging if plx_logger is not None: plx_logger.attach(trainer, log_handler=plxOutputHandler( tag="train", metric_names=metric_names), event_name=CustomEvents.ITERATION_K_COMPLETED) metrics = { 'loss': Loss(criterion, output_transform=lambda x: (x['y_pred'], x['y'])), 'mAcc': cmAccuracy(cm_metric).mean(), 'mPr': cmPrecision(cm_metric).mean(), 'mRe': cmRecall(cm_metric).mean(), 'mIoU': mIoU(cm_metric), 'mF1': cmFbeta(cm_metric, 1).mean() } iou = IoU(cm_metric) for i in range(num_classes): key_name = 'IoU_{}'.format(str(i)) metrics[key_name] = iou[i] inference_update_fn = partial( inference_update_function, model=model, cfg=cfg, output_transform_model=output_transform_model, inference_fn=inference_fn) evaluator = Engine(inference_update_fn) train_evaluator = Engine(inference_update_fn) for name, metric in metrics.items(): metric.attach(train_evaluator, name) metric.attach(evaluator, name) # Add checkpoint if save_model_dir: checkpoint = ModelCheckpoint(dirname=save_model_dir, filename_prefix='checkpoint', save_interval=save_interval, n_saved=n_saved, create_dir=True) trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint, { 'mymodel': model, 'optimizer': optimizer }) def trigger_k_iteration_started(engine, k): if engine.state.iteration % k == 0: engine.fire_event(CustomEvents.ITERATION_K_STARTED) def trigger_k_iteration_completed(engine, k): if engine.state.iteration % k == 0: engine.fire_event(CustomEvents.ITERATION_K_COMPLETED) def run_validation(engine, validation_interval): if (trainer.state.epoch - 1) % validation_interval == 0: train_evaluator.run(train1_sup_loader) evaluator.run(test_loader) if save_prediction_dir: train_output = train_evaluator.state.output test_output = evaluator.state.output iteration = str(trainer.state.iteration) epoch = str(trainer.state.epoch) save_prediction('train_{}_{}'.format(iteration, epoch), save_prediction_dir, train_output['x'], torch.argmax( train_output['y_pred'][0, :, :, :], dim=0), y=train_output['y'][0, :, :]) save_prediction('test_{}_{}'.format(iteration, epoch), save_prediction_dir, test_output['x'], torch.argmax(test_output['y_pred'][0, :, :, :], dim=0), y=test_output['y'][0, :, :]) train_evaluator.state.output = None evaluator.state.output = None if reduce_on_plateau is not None: reduce_on_plateau.step(evaluator.state.metrics['mIoU']) trainer.add_event_handler(Events.ITERATION_STARTED, trigger_k_iteration_started, k=10) trainer.add_event_handler(Events.ITERATION_COMPLETED, trigger_k_iteration_completed, k=10) trainer.add_event_handler(Events.EPOCH_STARTED, run_validation, validation_interval=val_interval) trainer.add_event_handler(Events.COMPLETED, run_validation, validation_interval=1) def trainer_prediction_save(engine, prediction_interval): if (engine.state.iteration - 1) % prediction_interval == 0: if save_prediction_dir: trainer_output = trainer.state.output['unsup pred'] iteration = str(trainer.state.iteration) epoch = str(trainer.state.epoch) save_prediction('trainer_{}_{}'.format(iteration, epoch), save_prediction_dir, trainer_output['x'], trainer_output['y_pred']) logger.debug( 'Saved trainer prediction for iteration {}'.format( str(engine.state.iteration))) trainer.state.output = None trainer.add_event_handler(Events.ITERATION_COMPLETED, trainer_prediction_save, prediction_interval=pred_interval) tb_logger.attach(train_evaluator, log_handler=tbOutputHandler(tag="train", metric_names=list( metrics.keys())), event_name=Events.EPOCH_COMPLETED) 
tb_logger.attach(evaluator, log_handler=tbOutputHandler(tag="test", metric_names=list( metrics.keys())), event_name=Events.EPOCH_COMPLETED) # Handlers for Polyaxon logging if plx_logger is not None: plx_logger.attach(train_evaluator, log_handler=plxOutputHandler(tag="train", metric_names=list( metrics.keys())), event_name=Events.EPOCH_COMPLETED) plx_logger.attach(evaluator, log_handler=plxOutputHandler(tag="test", metric_names=list( metrics.keys())), event_name=Events.EPOCH_COMPLETED) trainer.add_event_handler(Events.ITERATION_COMPLETED, mlflow_batch_metrics_logging, "train", trainer) train_evaluator.add_event_handler(Events.COMPLETED, mlflow_val_metrics_logging, "train", trainer) evaluator.add_event_handler(Events.COMPLETED, mlflow_val_metrics_logging, "test", trainer) data_steps = list(range(len(train1_sup_loader))) logger.debug('Start training') trainer.run(data_steps, max_epochs=num_epochs) logger.debug('Finished training')