Ejemplo n.º 1
0
def run(*options, cfg=None, local_rank=0, debug=False):
    """Run training and validation of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options from default.py will be overridden by options loaded from cfg file
        Options passed in via options argument will override option loaded from cfg file

    Args:
        *options (str,int ,optional): Options used to overide what is loaded from the
                                      config. To see what options are available consult
                                      default.py
        cfg (str, optional): Location of config file to load. Defaults to None.
    """
    update_config(config, options=options, config_file=cfg)

    # we will write the model under outputs / config_file_name / model_dir
    config_file_name = "default_config" if not cfg else cfg.split(
        "/")[-1].split(".")[0]

    # Start logging
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)
    silence_other_ranks = True
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    distributed = world_size > 1

    if distributed:
        # FOR DISTRIBUTED: Set the device according to local_rank.
        torch.cuda.set_device(local_rank)

        # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will
        # provide environment variables, and requires that you use init_method=`env://`.
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")

    epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)
    # Setup Augmentations
    basic_aug = Compose([
        Normalize(mean=(config.TRAIN.MEAN, ),
                  std=(config.TRAIN.STD, ),
                  max_pixel_value=1),
        PadIfNeeded(
            min_height=config.TRAIN.PATCH_SIZE,
            min_width=config.TRAIN.PATCH_SIZE,
            border_mode=config.OPENCV_BORDER_CONSTANT,
            always_apply=True,
            mask_value=255,
        ),
        Resize(
            config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT,
            config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH,
            always_apply=True,
        ),
        PadIfNeeded(
            min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT,
            min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH,
            border_mode=config.OPENCV_BORDER_CONSTANT,
            always_apply=True,
            mask_value=255,
        ),
    ])
    if config.TRAIN.AUGMENTATION:
        train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)])
        val_aug = basic_aug
    else:
        train_aug = val_aug = basic_aug

    TrainPatchLoader = get_patch_loader(config)

    train_set = TrainPatchLoader(
        config.DATASET.ROOT,
        split="train",
        is_transform=True,
        stride=config.TRAIN.STRIDE,
        patch_size=config.TRAIN.PATCH_SIZE,
        augmentations=train_aug,
    )

    val_set = TrainPatchLoader(
        config.DATASET.ROOT,
        split="val",
        is_transform=True,
        stride=config.TRAIN.STRIDE,
        patch_size=config.TRAIN.PATCH_SIZE,
        augmentations=val_aug,
    )

    logger.info(f"Validation examples {len(val_set)}")
    n_classes = train_set.n_classes

    if debug:
        val_set = data.Subset(val_set,
                              range(config.VALIDATION.BATCH_SIZE_PER_GPU))
        train_set = data.Subset(train_set,
                                range(config.TRAIN.BATCH_SIZE_PER_GPU * 2))

    logger.info(f"Training examples {len(train_set)}")
    logger.info(f"Validation examples {len(val_set)}")

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_set, num_replicas=world_size, rank=local_rank)

    train_loader = data.DataLoader(
        train_set,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
        num_workers=config.WORKERS,
        sampler=train_sampler,
    )

    val_sampler = torch.utils.data.distributed.DistributedSampler(
        val_set, num_replicas=world_size, rank=local_rank)

    val_loader = data.DataLoader(
        val_set,
        batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU,
        num_workers=config.WORKERS,
        sampler=val_sampler,
    )

    model = getattr(models, config.MODEL.NAME).get_seg_model(config)

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    model = model.to(device)  # Send to GPU

    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.TRAIN.MAX_LR,
        momentum=config.TRAIN.MOMENTUM,
        weight_decay=config.TRAIN.WEIGHT_DECAY,
    )

    # weights are inversely proportional to the frequency of the classes in
    # the training set
    class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS,
                                 device=device,
                                 requires_grad=False)

    criterion = torch.nn.CrossEntropyLoss(weight=class_weights,
                                          ignore_index=255,
                                          reduction="mean")

    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[device], find_unused_parameters=True)

    snapshot_duration = epochs_per_cycle * len(
        train_loader) if not debug else 2 * len(train_loader)

    warmup_duration = 5 * len(train_loader)

    warmup_scheduler = LinearCyclicalScheduler(
        optimizer,
        "lr",
        start_value=config.TRAIN.MAX_LR,
        end_value=config.TRAIN.MAX_LR * world_size,
        cycle_size=10 * len(train_loader),
    )
    cosine_scheduler = CosineAnnealingScheduler(
        optimizer,
        "lr",
        config.TRAIN.MAX_LR * world_size,
        config.TRAIN.MIN_LR * world_size,
        cycle_size=snapshot_duration,
    )

    scheduler = ConcatScheduler(
        schedulers=[warmup_scheduler, cosine_scheduler],
        durations=[warmup_duration])

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        prepare_batch,
                                        device=device)

    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    # Set to update the epoch parameter of our distributed data sampler so that we get
    # different shuffles
    trainer.add_event_handler(Events.EPOCH_STARTED,
                              update_sampler_epoch(train_loader))

    if silence_other_ranks & local_rank != 0:
        logging.getLogger("ignite.engine.engine.Engine").setLevel(
            logging.WARNING)

    def _select_pred_and_mask(model_out_dict):
        return (model_out_dict["y_pred"].squeeze(),
                model_out_dict["mask"].squeeze())

    evaluator = create_supervised_evaluator(
        model,
        prepare_batch,
        metrics={
            "nll":
            Loss(criterion,
                 output_transform=_select_pred_and_mask,
                 device=device),
            "pixa":
            pixelwise_accuracy(n_classes,
                               output_transform=_select_pred_and_mask,
                               device=device),
            "cacc":
            class_accuracy(n_classes,
                           output_transform=_select_pred_and_mask,
                           device=device),
            "mca":
            mean_class_accuracy(n_classes,
                                output_transform=_select_pred_and_mask,
                                device=device),
            "ciou":
            class_iou(n_classes,
                      output_transform=_select_pred_and_mask,
                      device=device),
            "mIoU":
            mean_iou(n_classes,
                     output_transform=_select_pred_and_mask,
                     device=device),
        },
        device=device,
    )

    # Set the validation run to start on the epoch completion of the training run

    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              Evaluator(evaluator, val_loader))

    if local_rank == 0:  # Run only on master process

        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            logging_handlers.log_training_output(
                log_interval=config.TRAIN.BATCH_SIZE_PER_GPU),
        )
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  logging_handlers.log_lr(optimizer))

        try:
            output_dir = generate_path(
                config.OUTPUT_DIR,
                git_branch(),
                git_hash(),
                config_file_name,
                config.TRAIN.MODEL_DIR,
                current_datetime(),
            )
        except TypeError:
            output_dir = generate_path(
                config.OUTPUT_DIR,
                config_file_name,
                config.TRAIN.MODEL_DIR,
                current_datetime(),
            )

        summary_writer = create_summary_writer(
            log_dir=path.join(output_dir, config.LOG_DIR))
        logger.info(
            f"Logging Tensorboard to {path.join(output_dir, config.LOG_DIR)}")
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"),
        )
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            tensorboard_handlers.log_training_output(summary_writer),
        )
        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED,
            logging_handlers.log_metrics(
                "Validation results",
                metrics_dict={
                    "nll": "Avg loss :",
                    "mIoU": " Avg IoU :",
                    "pixa": "Pixelwise Accuracy :",
                    "mca": "Mean Class Accuracy :",
                },
            ),
        )
        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED,
            tensorboard_handlers.log_metrics(
                summary_writer,
                trainer,
                "epoch",
                metrics_dict={
                    "mIoU": "Validation/IoU",
                    "nll": "Validation/Loss",
                    "mca": "Validation/MCA",
                },
            ),
        )

        def _select_max(pred_tensor):
            return pred_tensor.max(1)[1]

        def _tensor_to_numpy(pred_tensor):
            return pred_tensor.squeeze().cpu().numpy()

        transform_func = compose(np_to_tb, decode_segmap(n_classes=n_classes),
                                 _tensor_to_numpy)
        transform_pred = compose(transform_func, _select_max)
        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED,
            create_image_writer(summary_writer, "Validation/Image", "image"),
        )
        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED,
            create_image_writer(summary_writer,
                                "Validation/Mask",
                                "mask",
                                transform_func=transform_func),
        )
        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED,
            create_image_writer(
                summary_writer,
                "Validation/Pred",
                "y_pred",
                transform_func=transform_pred,
            ),
        )

        def snapshot_function():
            return (trainer.state.iteration % snapshot_duration) == 0

        checkpoint_handler = SnapshotHandler(
            output_dir,
            config.MODEL.NAME,
            extract_metric_from("mIoU"),
            snapshot_function,
        )
        evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                    {"model": model})
        logger.info("Starting training")

        if debug:
            trainer.run(
                train_loader,
                max_epochs=config.TRAIN.END_EPOCH,
                epoch_length=config.TRAIN.BATCH_SIZE_PER_GPU * 2,
                seed=config.SEED,
            )
        else:
            trainer.run(train_loader,
                        max_epochs=config.TRAIN.END_EPOCH,
                        epoch_length=len(train_loader),
                        seed=config.SEED)
Ejemplo n.º 2
0
def run(*options, cfg=None, debug=False):
    """Run testing of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options from default.py will be overridden by options loaded from cfg file
        Options passed in via options argument will override option loaded from cfg file
    
    Args:
        *options (str,int ,optional): Options used to overide what is loaded from the
                                      config. To see what options are available consult
                                      default.py
        cfg (str, optional): Location of config file to load. Defaults to None.
    """

    update_config(config, options=options, config_file=cfg)

    # Start logging
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)

    # Setup Augmentations
    test_aug = Compose(
        [
            Normalize(mean=(config.TRAIN.MEAN,), std=(config.TRAIN.STD,), max_pixel_value=config.TRAIN.MAX,),
            PadIfNeeded(
                min_height=config.TRAIN.PATCH_SIZE,
                min_width=config.TRAIN.PATCH_SIZE,
                border_mode=cv2.BORDER_CONSTANT,
                always_apply=True,
                mask_value=mask_value,
                value=0,
            ),
            Resize(
                config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True,
            ),
            PadIfNeeded(
                min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT,
                min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH,
                border_mode=cv2.BORDER_CONSTANT,
                always_apply=True,
                mask_value=mask_value,
                value=0,
            ),
        ]
    )

    PenobscotDataset = get_patch_dataset(config)

    test_set = PenobscotDataset(
        config.DATASET.ROOT,
        config.TRAIN.PATCH_SIZE,
        config.TRAIN.STRIDE,
        split="test",
        transforms=test_aug,
        n_channels=config.MODEL.IN_CHANNELS,
        complete_patches_only=config.TEST.COMPLETE_PATCHES_ONLY,
    )

    logger.info(str(test_set))
    n_classes = test_set.n_classes

    test_loader = data.DataLoader(
        test_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS,
    )

    model = getattr(models, config.MODEL.NAME).get_seg_model(config)
    logger.info(f"Loading model {config.TEST.MODEL_PATH}")
    model.load_state_dict(torch.load(config.TEST.MODEL_PATH), strict=False)

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    model = model.to(device)  # Send to GPU

    try:
        output_dir = generate_path(config.OUTPUT_DIR, git_branch(), git_hash(), config.MODEL.NAME, current_datetime(),)
    except TypeError:
        output_dir = generate_path(config.OUTPUT_DIR, config.MODEL.NAME, current_datetime(),)

    summary_writer = create_summary_writer(log_dir=path.join(output_dir, config.LOG_DIR))

    # weights are inversely proportional to the frequency of the classes in
    # the training set
    class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False)

    criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=mask_value, reduction="mean")

    def _select_pred_and_mask(model_out_dict):
        return (model_out_dict["y_pred"].squeeze(), model_out_dict["mask"].squeeze())

    def _select_all(model_out_dict):
        return (
            model_out_dict["y_pred"].squeeze(),
            model_out_dict["mask"].squeeze(),
            model_out_dict["ids"],
            model_out_dict["patch_locations"],
        )

    inline_mean_iou = InlineMeanIoU(
        config.DATASET.INLINE_HEIGHT,
        config.DATASET.INLINE_WIDTH,
        config.TRAIN.PATCH_SIZE,
        n_classes,
        padding=_padding_from(config),
        scale=_scale_from(config),
        output_transform=_select_all,
    )

    evaluator = create_supervised_evaluator(
        model,
        _prepare_batch,
        metrics={
            "nll": Loss(criterion, output_transform=_select_pred_and_mask, device=device),
            "inIoU": inline_mean_iou,
            "pixa": pixelwise_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device),
            "cacc": class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device),
            "mca": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device),
            "ciou": class_iou(n_classes, output_transform=_select_pred_and_mask, device=device),
            "mIoU": mean_iou(n_classes, output_transform=_select_pred_and_mask, device=device),
        },
        device=device,
    )

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        logging_handlers.log_metrics(
            "Test results",
            metrics_dict={
                "nll": "Avg loss :",
                "mIoU": "Avg IoU :",
                "pixa": "Pixelwise Accuracy :",
                "mca": "Mean Class Accuracy :",
                "inIoU": "Mean Inline IoU :",
            },
        ),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        tensorboard_handlers.log_metrics(
            summary_writer,
            evaluator,
            "epoch",
            metrics_dict={"mIoU": "Test/IoU", "nll": "Test/Loss", "mca": "Test/MCA", "inIoU": "Test/MeanInlineIoU",},
        ),
    )

    def _select_max(pred_tensor):
        return pred_tensor.max(1)[1]

    def _tensor_to_numpy(pred_tensor):
        return pred_tensor.squeeze().cpu().numpy()

    transform_func = compose(
        np_to_tb, decode_segmap(n_classes=n_classes, label_colours=_SEG_COLOURS), _tensor_to_numpy,
    )

    transform_pred = compose(transform_func, _select_max)

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Test/Image", "image"),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Test/Mask", "mask", transform_func=transform_func),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        create_image_writer(summary_writer, "Test/Pred", "y_pred", transform_func=transform_pred),
    )

    logger.info("Starting training")
    if debug:
        logger.info("Running in Debug/Test mode")
        test_loader = take(3, test_loader)

    evaluator.run(test_loader, max_epochs=1)

    # Log top N and bottom N inlines in terms of IoU to tensorboard
    inline_ious = inline_mean_iou.iou_per_inline()
    sorted_ious = sorted(inline_ious.items(), key=lambda x: x[1], reverse=True)
    topk = ((inline_mean_iou.predictions[key], inline_mean_iou.masks[key]) for key, iou in take(_TOP_K, sorted_ious))
    bottomk = (
        (inline_mean_iou.predictions[key], inline_mean_iou.masks[key]) for key, iou in tail(_BOTTOM_K, sorted_ious)
    )
    stack_and_decode = compose(transform_func, torch.stack)
    predictions, masks = unzip(chain(topk, bottomk))
    predictions_tensor = stack_and_decode(list(predictions))
    masks_tensor = stack_and_decode(list(masks))
    _log_tensor_to_tensorboard(predictions_tensor, "Test/InlinePredictions", summary_writer, evaluator)
    _log_tensor_to_tensorboard(masks_tensor, "Test/InlineMasks", summary_writer, evaluator)

    summary_writer.close()
Ejemplo n.º 3
0
def run(*options, cfg=None, debug=False):
    """Run training and validation of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options loaded from default.py will be overridden by those loaded from cfg file
        Options passed in via options argument will override those loaded from cfg file
    
    Args:
        *options (str, int, optional): Options used to overide what is loaded from the
                                    config. To see what options are available consult
                                    default.py
        cfg (str, optional): Location of config file to load. Defaults to None.
        debug (bool): Places scripts in debug/test mode and only executes a few iterations
    """

    update_config(config, options=options, config_file=cfg)

    # we will write the model under outputs / config_file_name / model_dir
    config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]

    # Start logging
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)
    epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"

    # Setup Augmentations
    basic_aug = Compose(
        [
            Normalize(mean=(config.TRAIN.MEAN,), std=(config.TRAIN.STD,), max_pixel_value=config.TRAIN.MAX,),
            PadIfNeeded(
                min_height=config.TRAIN.PATCH_SIZE,
                min_width=config.TRAIN.PATCH_SIZE,
                border_mode=config.OPENCV_BORDER_CONSTANT,
                always_apply=True,
                mask_value=mask_value,
                value=0,
            ),
            Resize(
                config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True,
            ),
            PadIfNeeded(
                min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT,
                min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH,
                border_mode=config.OPENCV_BORDER_CONSTANT,
                always_apply=True,
                mask_value=mask_value,
                value=0,
            ),
        ]
    )
    if config.TRAIN.AUGMENTATION:
        train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)])
        val_aug = basic_aug
    else:
        train_aug = val_aug = basic_aug

    PenobscotDataset = get_patch_dataset(config)

    train_set = PenobscotDataset(
        config.DATASET.ROOT,
        config.TRAIN.PATCH_SIZE,
        config.TRAIN.STRIDE,
        split="train",
        transforms=train_aug,
        n_channels=config.MODEL.IN_CHANNELS,
        complete_patches_only=config.TRAIN.COMPLETE_PATCHES_ONLY,
    )

    val_set = PenobscotDataset(
        config.DATASET.ROOT,
        config.TRAIN.PATCH_SIZE,
        config.TRAIN.STRIDE,
        split="val",
        transforms=val_aug,
        n_channels=config.MODEL.IN_CHANNELS,
        complete_patches_only=config.VALIDATION.COMPLETE_PATCHES_ONLY,
    )
    logger.info(train_set)
    logger.info(val_set)
    n_classes = train_set.n_classes

    train_loader = data.DataLoader(
        train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=True,
    )

    if debug:
        val_set = data.Subset(val_set, range(3))

    val_loader = data.DataLoader(val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS)

    model = getattr(models, config.MODEL.NAME).get_seg_model(config)

    model = model.to(device)  # Send to GPU

    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.TRAIN.MAX_LR,
        momentum=config.TRAIN.MOMENTUM,
        weight_decay=config.TRAIN.WEIGHT_DECAY,
    )

    try:
        output_dir = generate_path(
            config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),
        )
    except TypeError:
        output_dir = generate_path(config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),)

    summary_writer = create_summary_writer(log_dir=path.join(output_dir, config.LOG_DIR))
    snapshot_duration = epochs_per_cycle * len(train_loader) if not debug else 2 * len(train_loader)
    scheduler = CosineAnnealingScheduler(
        optimizer, "lr", config.TRAIN.MAX_LR, config.TRAIN.MIN_LR, cycle_size=snapshot_duration
    )

    # weights are inversely proportional to the frequency of the classes in
    # the training set
    class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False)

    criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=mask_value, reduction="mean")

    trainer = create_supervised_trainer(model, optimizer, criterion, _prepare_batch, device=device)

    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.TRAIN.BATCH_SIZE_PER_GPU),
    )
    trainer.add_event_handler(Events.EPOCH_STARTED, logging_handlers.log_lr(optimizer))
    trainer.add_event_handler(
        Events.EPOCH_STARTED, tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"),
    )
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer),
    )

    def _select_pred_and_mask(model_out_dict):
        return (model_out_dict["y_pred"].squeeze(), model_out_dict["mask"].squeeze())

    evaluator = create_supervised_evaluator(
        model,
        _prepare_batch,
        metrics={
            "pixacc": pixelwise_accuracy(n_classes, output_transform=_select_pred_and_mask),
            "nll": Loss(criterion, output_transform=_select_pred_and_mask),
            "cacc": class_accuracy(n_classes, output_transform=_select_pred_and_mask),
            "mca": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask),
            "ciou": class_iou(n_classes, output_transform=_select_pred_and_mask),
            "mIoU": mean_iou(n_classes, output_transform=_select_pred_and_mask),
        },
        device=device,
    )

    # Set the validation run to start on the epoch completion of the training run
    trainer.add_event_handler(Events.EPOCH_COMPLETED, Evaluator(evaluator, val_loader))

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        logging_handlers.log_metrics(
            "Validation results",
            metrics_dict={
                "nll": "Avg loss :",
                "pixacc": "Pixelwise Accuracy :",
                "mca": "Avg Class Accuracy :",
                "mIoU": "Avg Class IoU :",
            },
        ),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        tensorboard_handlers.log_metrics(
            summary_writer,
            trainer,
            "epoch",
            metrics_dict={
                "mIoU": "Validation/mIoU",
                "nll": "Validation/Loss",
                "mca": "Validation/MCA",
                "pixacc": "Validation/Pixel_Acc",
            },
        ),
    )

    def _select_max(pred_tensor):
        return pred_tensor.max(1)[1]

    def _tensor_to_numpy(pred_tensor):
        return pred_tensor.squeeze().cpu().numpy()

    transform_func = compose(np_to_tb, decode_segmap, _tensor_to_numpy,)

    transform_pred = compose(transform_func, _select_max)

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Image", "image"),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        create_image_writer(summary_writer, "Validation/Mask", "mask", transform_func=transform_func),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        create_image_writer(summary_writer, "Validation/Pred", "y_pred", transform_func=transform_pred),
    )

    def snapshot_function():
        return (trainer.state.iteration % snapshot_duration) == 0

    checkpoint_handler = SnapshotHandler(output_dir, config.MODEL.NAME, extract_metric_from("mIoU"), snapshot_function,)
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model})

    logger.info("Starting training")
    if debug:
        trainer.run(
            train_loader,
            max_epochs=config.TRAIN.END_EPOCH,
            epoch_length=config.TRAIN.BATCH_SIZE_PER_GPU,
            seed=config.SEED,
        )
    else:
        trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED)
Ejemplo n.º 4
0
def run(*options, cfg=None, debug=False):
    """Run training and validation of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options from default.py will be overridden by options loaded from cfg file
        Options passed in via options argument will override option loaded from cfg file

    Args:
        *options (str,int ,optional): Options used to overide what is loaded from the
                                      config. To see what options are available consult
                                      default.py
        cfg (str, optional): Location of config file to load. Defaults to None.
    """

    update_config(config, options=options, config_file=cfg)

    # Start logging
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)
    scheduler_step = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)

    # Setup Augmentations
    basic_aug = Compose([
        Normalize(mean=(config.TRAIN.MEAN, ),
                  std=(config.TRAIN.STD, ),
                  max_pixel_value=1)
    ])
    if config.TRAIN.AUGMENTATION:
        train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)])
        val_aug = basic_aug
    else:
        train_aug = val_aug = basic_aug

    TrainLoader = get_section_loader(config)

    train_set = TrainLoader(
        data_dir=config.DATASET.ROOT,
        split="train",
        is_transform=True,
        augmentations=train_aug,
    )

    val_set = TrainLoader(
        data_dir=config.DATASET.ROOT,
        split="val",
        is_transform=True,
        augmentations=val_aug,
    )

    class CustomSampler(torch.utils.data.Sampler):
        def __init__(self, data_source):
            self.data_source = data_source

        def __iter__(self):
            char = ["i" if np.random.randint(2) == 1 else "x"]
            self.indices = [
                idx for (idx, name) in enumerate(self.data_source)
                if char[0] in name
            ]
            return (self.indices[i] for i in torch.randperm(len(self.indices)))

        def __len__(self):
            return len(self.data_source)

    n_classes = train_set.n_classes

    val_list = val_set.sections
    train_list = val_set.sections

    train_loader = data.DataLoader(
        train_set,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
        sampler=CustomSampler(train_list),
        num_workers=config.WORKERS,
        shuffle=False,
    )

    val_loader = data.DataLoader(
        val_set,
        batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU,
        sampler=CustomSampler(val_list),
        num_workers=config.WORKERS,
    )

    model = getattr(models, config.MODEL.NAME).get_seg_model(config)

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    model = model.to(device)  # Send to GPU

    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.TRAIN.MAX_LR,
        momentum=config.TRAIN.MOMENTUM,
        weight_decay=config.TRAIN.WEIGHT_DECAY,
    )

    try:
        output_dir = generate_path(
            config.OUTPUT_DIR,
            git_branch(),
            git_hash(),
            config.MODEL.NAME,
            current_datetime(),
        )
    except TypeError:
        output_dir = generate_path(
            config.OUTPUT_DIR,
            config.MODEL.NAME,
            current_datetime(),
        )

    summary_writer = create_summary_writer(
        log_dir=path.join(output_dir, config.LOG_DIR))

    snapshot_duration = scheduler_step * len(train_loader)
    scheduler = CosineAnnealingScheduler(optimizer, "lr", config.TRAIN.MAX_LR,
                                         config.TRAIN.MIN_LR,
                                         snapshot_duration)

    # weights are inversely proportional to the frequency of the classes in
    # the training set
    class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS,
                                 device=device,
                                 requires_grad=False)

    criterion = torch.nn.CrossEntropyLoss(weight=class_weights,
                                          ignore_index=255,
                                          reduction="mean")

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        prepare_batch,
                                        device=device)

    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    trainer.add_event_handler(
        Events.ITERATION_COMPLETED,
        logging_handlers.log_training_output(log_interval=config.PRINT_FREQ),
    )

    trainer.add_event_handler(Events.EPOCH_STARTED,
                              logging_handlers.log_lr(optimizer))

    trainer.add_event_handler(
        Events.EPOCH_STARTED,
        tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"),
    )

    trainer.add_event_handler(
        Events.ITERATION_COMPLETED,
        tensorboard_handlers.log_training_output(summary_writer),
    )

    def _select_pred_and_mask(model_out_dict):
        return (model_out_dict["y_pred"].squeeze(),
                model_out_dict["mask"].squeeze())

    evaluator = create_supervised_evaluator(
        model,
        prepare_batch,
        metrics={
            "nll":
            Loss(criterion,
                 output_transform=_select_pred_and_mask,
                 device=device),
            "pixacc":
            pixelwise_accuracy(n_classes,
                               output_transform=_select_pred_and_mask,
                               device=device),
            "cacc":
            class_accuracy(n_classes,
                           output_transform=_select_pred_and_mask,
                           device=device),
            "mca":
            mean_class_accuracy(n_classes,
                                output_transform=_select_pred_and_mask,
                                device=device),
            "ciou":
            class_iou(n_classes,
                      output_transform=_select_pred_and_mask,
                      device=device),
            "mIoU":
            mean_iou(n_classes,
                     output_transform=_select_pred_and_mask,
                     device=device),
        },
        device=device,
    )

    if debug:
        logger.info("Running Validation in Debug/Test mode")
        val_loader = take(3, val_loader)
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              Evaluator(evaluator, val_loader))

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        logging_handlers.log_metrics(
            "Validation results",
            metrics_dict={
                "nll": "Avg loss :",
                "pixacc": "Pixelwise Accuracy :",
                "mca": "Avg Class Accuracy :",
                "mIoU": "Avg Class IoU :",
            },
        ),
    )

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        logging_handlers.log_class_metrics(
            "Per class validation results",
            metrics_dict={
                "ciou": "Class IoU :",
                "cacc": "Class Accuracy :"
            },
        ),
    )

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        tensorboard_handlers.log_metrics(
            summary_writer,
            trainer,
            "epoch",
            metrics_dict={
                "mIoU": "Validation/mIoU",
                "nll": "Validation/Loss",
                "mca": "Validation/MCA",
                "pixacc": "Validation/Pixel_Acc",
            },
        ),
    )

    def _select_max(pred_tensor):
        return pred_tensor.max(1)[1]

    def _tensor_to_numpy(pred_tensor):
        return pred_tensor.squeeze().cpu().numpy()

    transform_func = compose(np_to_tb, decode_segmap(n_classes=n_classes),
                             _tensor_to_numpy)

    transform_pred = compose(transform_func, _select_max)

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        create_image_writer(summary_writer, "Validation/Image", "image"),
    )

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        create_image_writer(summary_writer,
                            "Validation/Mask",
                            "mask",
                            transform_func=transform_func),
    )

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        create_image_writer(summary_writer,
                            "Validation/Pred",
                            "y_pred",
                            transform_func=transform_pred),
    )

    def snapshot_function():
        return (trainer.state.iteration % snapshot_duration) == 0

    checkpoint_handler = SnapshotHandler(
        path.join(output_dir, config.TRAIN.MODEL_DIR),
        config.MODEL.NAME,
        extract_metric_from("mIoU"),
        snapshot_function,
    )

    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                {"model": model})

    logger.info("Starting training")
    if debug:
        logger.info("Running Validation in Debug/Test mode")
        train_loader = take(3, train_loader)
    trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH)
Ejemplo n.º 5
0
def _evaluate_split(
    split,
    section_aug,
    model,
    pre_processing,
    output_processing,
    device,
    running_metrics_overall,
    config,
    debug=False,
):
    logger = logging.getLogger(__name__)

    TestSectionLoader = get_test_loader(config)
    test_set = TestSectionLoader(
        config.DATASET.ROOT,
        split=split,
        is_transform=True,
        augmentations=section_aug,
    )

    n_classes = test_set.n_classes

    test_loader = data.DataLoader(test_set,
                                  batch_size=1,
                                  num_workers=config.WORKERS,
                                  shuffle=False)

    if debug:
        logger.info("Running in Debug/Test mode")
        test_loader = take(1, test_loader)

    try:
        output_dir = generate_path(
            config.OUTPUT_DIR + "_test",
            git_branch(),
            git_hash(),
            config.MODEL.NAME,
            current_datetime(),
        )
    except TypeError:
        output_dir = generate_path(
            config.OUTPUT_DIR + "_test",
            config.MODEL.NAME,
            current_datetime(),
        )

    running_metrics_split = runningScore(n_classes)

    # testing mode:
    with torch.no_grad():  # operations inside don't track history
        model.eval()
        total_iteration = 0
        for i, (images, labels) in enumerate(test_loader):
            logger.info(f"split: {split}, section: {i}")
            total_iteration = total_iteration + 1

            outputs = _patch_label_2d(
                model,
                images,
                pre_processing,
                output_processing,
                config.TRAIN.PATCH_SIZE,
                config.TEST.TEST_STRIDE,
                config.VALIDATION.BATCH_SIZE_PER_GPU,
                device,
                n_classes,
            )

            pred = outputs.detach().max(1)[1].numpy()
            gt = labels.numpy()
            running_metrics_split.update(gt, pred)
            running_metrics_overall.update(gt, pred)

            #  dump images to disk for review
            mask_to_disk(pred.squeeze(),
                         os.path.join(output_dir, f"{i}_pred.png"))
            mask_to_disk(gt.squeeze(), os.path.join(output_dir, f"{i}_gt.png"))

    # get scores
    score, class_iou = running_metrics_split.get_scores()

    # Log split results
    logger.info(f'Pixel Acc: {score["Pixel Acc: "]:.3f}')
    for cdx, class_name in enumerate(_CLASS_NAMES):
        logger.info(
            f'  {class_name}_accuracy {score["Class Accuracy: "][cdx]:.3f}')

    logger.info(f'Mean Class Acc: {score["Mean Class Acc: "]:.3f}')
    logger.info(f'Freq Weighted IoU: {score["Freq Weighted IoU: "]:.3f}')
    logger.info(f'Mean IoU: {score["Mean IoU: "]:0.3f}')
    running_metrics_split.reset()