コード例 #1
0
    def __getitem__(self, index):

        section_name = self.sections[index]
        direction, number = section_name.split(sep="_")

        if direction == "i":
            im = self.seismic[int(number), :, :, :]
            lbl = self.labels[int(number), :, :]
        elif direction == "x":
            im = self.seismic[:, :, int(number), :]
            lbl = self.labels[:, int(number), :]

            im = np.swapaxes(im, 0, 1)  # From WCH to CWH

        im, lbl = _transform_WH_to_HW(im), _transform_WH_to_HW(lbl)

        # dump images before augmentation
        if self.debug:
            outdir = f"debug/testSectionLoaderWithDepth_{self.split}_raw"
            generate_path(outdir)
            # this needs to take the first dimension of image (no depth) but lbl only has 1 dim
            path_prefix = f"{outdir}/index_{index}_section_{section_name}"
            image_to_disk(im[0, :, :], path_prefix + "_img.png")
            mask_to_disk(lbl, path_prefix + "_lbl.png", self.n_classes)

        if self.augmentations is not None:
            im = _transform_CHW_to_HWC(im)
            augmented_dict = self.augmentations(image=im, mask=lbl)
            im, lbl = augmented_dict["image"], augmented_dict["mask"]
            im = _transform_HWC_to_CHW(im)

        if self.is_transform:
            im, lbl = self.transform(im, lbl)

        # dump images and labels to disk after augmentation
        if self.debug:
            outdir = (
                f"debug/testSectionLoaderWithDepth_{self.split}_{'aug' if self.augmentations is not None else 'noaug'}"
            )
            generate_path(outdir)
            path_prefix = f"{outdir}/index_{index}_section_{section_name}"
            image_to_disk(np.array(im[0, :, :]), path_prefix + "_img.png")
            mask_to_disk(np.array(lbl[0, :, :]), path_prefix + "_lbl.png",
                         self.n_classes)

        return im, lbl
コード例 #2
0
    def __getitem__(self, index):

        patch_name = self.patches[index]
        direction, idx, xdx, ddx = patch_name.split(sep="_")

        # Shift offsets the padding that is added in training
        # shift = self.patch_size if "test" not in self.split else 0
        # Remember we are cancelling the shift since we no longer pad
        shift = 0
        idx, xdx, ddx = int(idx) + shift, int(xdx) + shift, int(ddx) + shift

        if direction == "i":
            im = self.seismic[idx, :, xdx:xdx + self.patch_size,
                              ddx:ddx + self.patch_size]
            lbl = self.labels[idx, xdx:xdx + self.patch_size,
                              ddx:ddx + self.patch_size]
        elif direction == "x":
            im = self.seismic[idx:idx + self.patch_size, :, xdx,
                              ddx:ddx + self.patch_size]
            lbl = self.labels[idx:idx + self.patch_size, xdx,
                              ddx:ddx + self.patch_size]
            im = np.swapaxes(im, 0, 1)  # From WCH to CWH

        im, lbl = _transform_WH_to_HW(im), _transform_WH_to_HW(lbl)

        if self.augmentations is not None:
            im = _transform_CHW_to_HWC(im)
            augmented_dict = self.augmentations(image=im, mask=lbl)
            im, lbl = augmented_dict["image"], augmented_dict["mask"]
            im = _transform_HWC_to_CHW(im)

        # dump images and labels to disk
        if self.debug:
            outdir = f"patchLoaderWithSectionDepth_{self.split}_{'aug' if self.augmentations is not None else 'noaug'}"
            generate_path(outdir)
            image_to_disk(im[0, :, :], f"{outdir}/{index}_img.png")
            mask_to_disk(lbl, f"{outdir}/{index}_lbl.png")

        if self.is_transform:
            im, lbl = self.transform(im, lbl)
        return im, lbl
コード例 #3
0
    def __getitem__(self, index):

        patch_name = self.patches[index]
        direction, idx, xdx, ddx = patch_name.split(sep="_")

        # Shift offsets the padding that is added in training
        # shift = self.patch_size if "test" not in self.split else 0
        # Remember we are cancelling the shift since we no longer pad
        shift = 0
        idx, xdx, ddx = int(idx) + shift, int(xdx) + shift, int(ddx) + shift

        if direction == "i":
            im = self.seismic[idx, xdx:xdx + self.patch_size,
                              ddx:ddx + self.patch_size]
            lbl = self.labels[idx, xdx:xdx + self.patch_size,
                              ddx:ddx + self.patch_size]
        elif direction == "x":
            im = self.seismic[idx:idx + self.patch_size, xdx,
                              ddx:ddx + self.patch_size]
            lbl = self.labels[idx:idx + self.patch_size, xdx,
                              ddx:ddx + self.patch_size]

        im, lbl = _transform_WH_to_HW(im), _transform_WH_to_HW(lbl)

        # dump raw images before augmentation
        if self.debug:
            outdir = f"debug/patchLoader_{self.split}_raw"
            generate_path(outdir)
            path_prefix = f"{outdir}/index_{index}_section_{patch_name}"
            image_to_disk(im, path_prefix + "_img.png")
            mask_to_disk(lbl, path_prefix + "_lbl.png", self.n_classes)

        if self.augmentations is not None:
            augmented_dict = self.augmentations(image=im, mask=lbl)
            im, lbl = augmented_dict["image"], augmented_dict["mask"]

        # dump images and labels to disk
        if self.debug:
            outdir = f"patchLoader_{self.split}_{'aug' if self.augmentations is not None else 'noaug'}"
            generate_path(outdir)
            path_prefix = f"{outdir}/{index}"
            image_to_disk(im, path_prefix + "_img.png")
            mask_to_disk(lbl, path_prefix + "_lbl.png", self.n_classes)

        if self.is_transform:
            im, lbl = self.transform(im, lbl)

        # dump images and labels to disk
        if self.debug:
            outdir = f"debug/patchLoader_{self.split}_{'aug' if self.augmentations is not None else 'noaug'}"
            generate_path(outdir)
            path_prefix = f"{outdir}/index_{index}_section_{patch_name}"
            image_to_disk(np.array(im[0, :, :]), path_prefix + "_img.png")
            mask_to_disk(np.array(lbl[0, :, :]), path_prefix + "_lbl.png",
                         self.n_classes)

        return im, lbl
コード例 #4
0
    def __getitem__(self, index):

        section_name = self.sections[index]
        direction, number = section_name.split(sep="_")

        if direction == "i":
            im = self.seismic[int(number), :, :]
            lbl = self.labels[int(number), :, :]
        elif direction == "x":
            im = self.seismic[:, int(number), :]
            lbl = self.labels[:, int(number), :]

        im, lbl = _transform_WH_to_HW(im), _transform_WH_to_HW(lbl)

        if self.debug and "test" in self.split:
            outdir = f"debug/sectionLoader_{self.split}_raw"
            generate_path(outdir)
            path_prefix = f"{outdir}/index_{index}_section_{section_name}"
            image_to_disk(im, path_prefix + "_img.png")
            mask_to_disk(lbl, path_prefix + "_lbl.png", self.n_classes)

        if self.augmentations is not None:
            augmented_dict = self.augmentations(image=im, mask=lbl)
            im, lbl = augmented_dict["image"], augmented_dict["mask"]

        if self.is_transform:
            im, lbl = self.transform(im, lbl)

        if self.debug and "test" in self.split:
            outdir = f"debug/sectionLoader_{self.split}_{'aug' if self.augmentations is not None else 'noaug'}"
            generate_path(outdir)
            path_prefix = f"{outdir}/index_{index}_section_{section_name}"
            image_to_disk(np.array(im[0]), path_prefix + "_img.png")
            mask_to_disk(np.array(lbl[0]), path_prefix + "_lbl.png",
                         self.n_classes)

        return im, lbl
コード例 #5
0
def run(*options, cfg=None, debug=False):
    """Run training and validation of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options from default.py will be overridden by options loaded from cfg file
        Options from default.py will be overridden by options loaded from cfg file
        Options passed in via options argument will override option loaded from cfg file
    
    Args:
        *options (str,int ,optional): Options used to overide what is loaded from the
                                      config. To see what options are available consult
                                      default.py
        cfg (str, optional): Location of config file to load. Defaults to None.        
        debug (bool): Places scripts in debug/test mode and only executes a few iterations
    """
    # Configuration:
    update_config(config, options=options, config_file=cfg)

    # The model will be saved under: outputs/<config_file_name>/<model_dir>
    config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]
    try:
        output_dir = generate_path(
            config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),
        )
    except:
        output_dir = generate_path(config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),)

    # Logging:
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)

    # Set CUDNN benchmark mode:
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    # We will write the model under outputs / config_file_name / model_dir
    config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]

    # Fix random seeds:
    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)

    # Augmentation:
    basic_aug = Compose(
        [
            Normalize(mean=(config.TRAIN.MEAN,), std=(config.TRAIN.STD,), max_pixel_value=1),
            PadIfNeeded(
                min_height=config.TRAIN.PATCH_SIZE,
                min_width=config.TRAIN.PATCH_SIZE,
                border_mode=config.OPENCV_BORDER_CONSTANT,
                always_apply=True,
                mask_value=255,
                value=0,
            ),
            Resize(
                config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True,
            ),
            PadIfNeeded(
                min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT,
                min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH,
                border_mode=config.OPENCV_BORDER_CONSTANT,
                always_apply=True,
                mask_value=255,
            ),
        ]
    )
    if config.TRAIN.AUGMENTATION:
        train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)])
        val_aug = basic_aug
    else:
        train_aug = val_aug = basic_aug

    # Training and Validation Loaders:
    TrainPatchLoader = get_patch_loader(config)
    logging.info(f"Using {TrainPatchLoader}")
    train_set = TrainPatchLoader(
        config.DATASET.ROOT,
        config.DATASET.NUM_CLASSES,
        split="train",
        is_transform=True,
        stride=config.TRAIN.STRIDE,
        patch_size=config.TRAIN.PATCH_SIZE,
        augmentations=train_aug,
        debug=debug,
    )
    logger.info(train_set)
    n_classes = train_set.n_classes
    val_set = TrainPatchLoader(
        config.DATASET.ROOT,
        config.DATASET.NUM_CLASSES,
        split="val",
        is_transform=True,
        stride=config.TRAIN.STRIDE,
        patch_size=config.TRAIN.PATCH_SIZE,
        augmentations=val_aug,
        debug=debug,
    )
    logger.info(val_set)

    if debug:
        logger.info("Running in debug mode..")
        train_set = data.Subset(train_set, range(config.TRAIN.BATCH_SIZE_PER_GPU * config.NUM_DEBUG_BATCHES))
        val_set = data.Subset(val_set, range(config.VALIDATION.BATCH_SIZE_PER_GPU))

    train_loader = data.DataLoader(
        train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=True
    )
    val_loader = data.DataLoader(
        val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=1
    )  # config.WORKERS)

    # Model:
    model = getattr(models, config.MODEL.NAME).get_seg_model(config)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Optimizer and LR Scheduler:
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.TRAIN.MAX_LR,
        momentum=config.TRAIN.MOMENTUM,
        weight_decay=config.TRAIN.WEIGHT_DECAY,
    )

    epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
    snapshot_duration = epochs_per_cycle * len(train_loader) if not debug else 2 * len(train_loader)
    scheduler = CosineAnnealingScheduler(
        optimizer, "lr", config.TRAIN.MAX_LR, config.TRAIN.MIN_LR, cycle_size=snapshot_duration
    )

    # Tensorboard writer:
    summary_writer = create_summary_writer(log_dir=path.join(output_dir, "logs"))

    # class weights are inversely proportional to the frequency of the classes in the training set
    class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False)

    # Loss:
    criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean")

    # Ignite trainer and evaluator:
    trainer = create_supervised_trainer(model, optimizer, criterion, prepare_batch, device=device)
    transform_fn = lambda output_dict: (output_dict["y_pred"].squeeze(), output_dict["mask"].squeeze())
    evaluator = create_supervised_evaluator(
        model,
        prepare_batch,
        metrics={
            "nll": Loss(criterion, output_transform=transform_fn),
            "pixacc": pixelwise_accuracy(n_classes, output_transform=transform_fn, device=device),
            "cacc": class_accuracy(n_classes, output_transform=transform_fn),
            "mca": mean_class_accuracy(n_classes, output_transform=transform_fn),
            "ciou": class_iou(n_classes, output_transform=transform_fn),
            "mIoU": mean_iou(n_classes, output_transform=transform_fn),
        },
        device=device,
    )
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Logging:
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.PRINT_FREQ),
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED, logging_handlers.log_lr(optimizer))

    # Tensorboard and Logging:
    trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer))
    trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_validation_output(summary_writer))

    # add specific logger which also triggers printed metrics on training set
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Training")
        logging_handlers.log_metrics(engine, evaluator, stage="Training")

    # add specific logger which also triggers printed metrics on validation set
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Validation")
        logging_handlers.log_metrics(engine, evaluator, stage="Validation")
        # dump validation set metrics at the very end for debugging purposes
        if engine.state.epoch == config.TRAIN.END_EPOCH and debug:
            fname = f"metrics_{config_file_name}_{config.TRAIN.MODEL_DIR}.json"
            metrics = evaluator.state.metrics
            out_dict = {x: metrics[x] for x in ["nll", "pixacc", "mca", "mIoU"]}
            with open(fname, "w") as fid:
                json.dump(out_dict, fid)
            log_msg = " ".join(f"{k}: {out_dict[k]}" for k in out_dict.keys())
            logging.info(log_msg)

    # Checkpointing: snapshotting trained models to disk
    checkpoint_handler = SnapshotHandler(
        output_dir,
        config.MODEL.NAME,
        extract_metric_from("mIoU"),
        lambda: (trainer.state.iteration % snapshot_duration) == 0,
    )
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model})

    logger.info("Starting training")
    trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED)

    summary_writer.close()
コード例 #6
0
def _evaluate_split(
    split,
    section_aug,
    model,
    pre_processing,
    output_processing,
    device,
    running_metrics_overall,
    config,
    data_flow,
    debug=False,
):
    logger = logging.getLogger(__name__)

    TestSectionLoader = get_test_loader(config)

    test_set = TestSectionLoader(
        config,
        split=split,
        is_transform=True,
        augmentations=section_aug,
        debug=debug,
    )

    n_classes = test_set.n_classes

    if debug:
        data_flow[split] = dict()
        data_flow[split]["test_section_loader_length"] = len(test_set)
        data_flow[split]["test_input_shape"] = test_set.seismic.shape
        data_flow[split]["test_label_shape"] = test_set.labels.shape
        data_flow[split]["n_classes"] = n_classes

    test_loader = data.DataLoader(test_set,
                                  batch_size=1,
                                  num_workers=config.WORKERS,
                                  shuffle=False)

    if debug:
        data_flow[split]["test_loader_length"] = len(test_loader)
        logger.info("Running in Debug/Test mode")
        take_n = 2
        test_loader = take(take_n, test_loader)
        data_flow[split]["take_n_sections"] = take_n
        pred_list, gt_list, img_list = [], [], []

    try:
        output_dir = generate_path(
            f"{config.OUTPUT_DIR}/test/{split}",
            git_branch(),
            git_hash(),
            config.MODEL.NAME,
            current_datetime(),
        )
    except:
        output_dir = generate_path(
            f"{config.OUTPUT_DIR}/test/{split}",
            config.MODEL.NAME,
            current_datetime(),
        )

    running_metrics_split = runningScore(n_classes)

    # evaluation mode:
    with torch.no_grad():  # operations inside don't track history
        model.eval()
        for i, (images, labels) in enumerate(test_loader):
            logger.info(f"split: {split}, section: {i}")
            outputs = _patch_label_2d(
                model,
                images,
                pre_processing,
                output_processing,
                config.TRAIN.PATCH_SIZE,
                config.TEST.TEST_STRIDE,
                config.VALIDATION.BATCH_SIZE_PER_GPU,
                device,
                n_classes,
                split,
                debug,
                config.DATASET.MIN,
                config.DATASET.MAX,
            )

            pred = outputs.detach().max(1)[1].numpy()
            gt = labels.numpy()
            if debug:
                pred_list.append((pred.shape, len(np.unique(pred))))
                gt_list.append((gt.shape, len(np.unique(gt))))
                img_list.append(images.numpy().shape)

            running_metrics_split.update(gt, pred)
            running_metrics_overall.update(gt, pred)

            #  dump images to disk for review
            mask_to_disk(pred.squeeze(),
                         os.path.join(output_dir, f"{i}_pred.png"), n_classes)
            mask_to_disk(gt.squeeze(), os.path.join(output_dir, f"{i}_gt.png"),
                         n_classes)

    if debug:
        data_flow[split]["pred_shape"] = pred_list
        data_flow[split]["gt_shape"] = gt_list
        data_flow[split]["img_shape"] = img_list

    # get scores
    score, class_iou = running_metrics_split.get_scores()

    # Log split results
    logger.info(f'Pixel Acc: {score["Pixel Acc: "]:.3f}')
    if debug:
        for cdx in range(n_classes):
            logger.info(
                f'  Class_{cdx}_accuracy {score["Class Accuracy: "][cdx]:.3f}')
    else:
        for cdx, class_name in enumerate(_CLASS_NAMES):
            logger.info(
                f'  {class_name}_accuracy {score["Class Accuracy: "][cdx]:.3f}'
            )

    logger.info(f'Mean Class Acc: {score["Mean Class Acc: "]:.3f}')
    logger.info(f'Freq Weighted IoU: {score["Freq Weighted IoU: "]:.3f}')
    logger.info(f'Mean IoU: {score["Mean IoU: "]:0.3f}')
    running_metrics_split.reset()
コード例 #7
0
def _patch_label_2d(
    model,
    img,
    pre_processing,
    output_processing,
    patch_size,
    stride,
    batch_size,
    device,
    num_classes,
    split,
    debug,
    MIN,
    MAX,
):
    """Processes a whole section
    """

    img = torch.squeeze(img)
    h, w = img.shape[-2], img.shape[-1]  # height and width

    # Pad image with patch_size/2:
    ps = int(np.floor(patch_size / 2))  # pad size
    img_p = F.pad(img, pad=(ps, ps, ps, ps), mode="constant", value=0)
    output_p = torch.zeros([1, num_classes, h + 2 * ps, w + 2 * ps])

    # generate output:
    for batch_indexes in _generate_batches(h,
                                           w,
                                           ps,
                                           patch_size,
                                           stride,
                                           batch_size=batch_size):
        batch = torch.stack(
            [
                pipe(
                    img_p,
                    _extract_patch(hdx, wdx, ps, patch_size),
                    pre_processing,
                ) for hdx, wdx in batch_indexes
            ],
            dim=0,
        )

        model_output = model(batch.to(device))

        for (hdx, wdx), output in zip(batch_indexes,
                                      model_output.detach().cpu()):
            output = output_processing(output)
            output_p[:, :, hdx + ps:hdx + ps + patch_size,
                     wdx + ps:wdx + ps + patch_size, ] += output

        # dump the data right before it's being put into the model and after scoring
        if debug:
            outdir = f"debug/test/batch_{split}"
            generate_path(outdir)
            for i in range(batch.shape[0]):
                path_prefix = f"{outdir}/{batch_indexes[i][0]}_{batch_indexes[i][1]}"
                model_output = model_output.detach().cpu()
                # save image:
                image_to_disk(np.array(batch[i, 0, :, :]),
                              path_prefix + "_img.png", MIN, MAX)
                # dump model prediction:
                mask_to_disk(model_output[i, :, :, :].argmax(dim=0).numpy(),
                             path_prefix + "_pred.png", num_classes)
                # dump model confidence values
                for nclass in range(num_classes):
                    image_to_disk(model_output[i, nclass, :, :].numpy(),
                                  path_prefix + f"_class_{nclass}_conf.png",
                                  MIN, MAX)

    # crop the output_p in the middle
    output = output_p[:, :, ps:-ps, ps:-ps]
    return output
コード例 #8
0
ファイル: train.py プロジェクト: zaky9/seismic-deeplearning
def run(*options,
        cfg=None,
        local_rank=0,
        debug=False,
        input=None,
        distributed=False):
    """Run training and validation of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options from default.py will be overridden by options loaded from cfg file
        Options from default.py will be overridden by options loaded from cfg file
        Options passed in via options argument will override option loaded from cfg file
    
    Args:
        *options (str,int ,optional): Options used to overide what is loaded from the
                                      config. To see what options are available consult
                                      default.py
        cfg (str, optional): Location of config file to load. Defaults to None.        
        debug (bool): Places scripts in debug/test mode and only executes a few iterations
        input (str, optional): Location of data if Azure ML run, 
            for local runs input is config.DATASET.ROOT
        distributed (bool): This flag tells the training script to run in distributed mode
            if more than one GPU exists.
    """

    # if AML training pipeline supplies us with input
    if input is not None:
        data_dir = input
        output_dir = data_dir + config.OUTPUT_DIR

    # Start logging
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)

    # Configuration:
    update_config(config, options=options, config_file=cfg)
    silence_other_ranks = True

    world_size = int(os.environ.get("WORLD_SIZE", 1))
    distributed = world_size > 1

    if distributed:
        # FOR DISTRIBUTED: Set the device according to local_rank.
        torch.cuda.set_device(local_rank)

        # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will
        # provide environment variables, and requires that you use init_method=`env://`.
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        logging.info(f"Started train.py using distributed mode.")
    else:
        logging.info(f"Started train.py using local mode.")

    # Set CUDNN benchmark mode:
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    # Fix random seeds:
    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)

    # Augmentation:
    basic_aug = Compose([
        Normalize(mean=(config.TRAIN.MEAN, ),
                  std=(config.TRAIN.STD, ),
                  max_pixel_value=1),
        PadIfNeeded(
            min_height=config.TRAIN.PATCH_SIZE,
            min_width=config.TRAIN.PATCH_SIZE,
            border_mode=config.OPENCV_BORDER_CONSTANT,
            always_apply=True,
            mask_value=255,
            value=0,
        ),
        Resize(
            config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT,
            config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH,
            always_apply=True,
        ),
        PadIfNeeded(
            min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT,
            min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH,
            border_mode=config.OPENCV_BORDER_CONSTANT,
            always_apply=True,
            mask_value=255,
        ),
    ])
    if config.TRAIN.AUGMENTATION:
        train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)])
        val_aug = basic_aug
    else:
        train_aug = val_aug = basic_aug

    # Training and Validation Loaders:
    TrainPatchLoader = get_patch_loader(config)
    logging.info(f"Using {TrainPatchLoader}")

    train_set = TrainPatchLoader(
        config,
        split="train",
        is_transform=True,
        augmentations=train_aug,
        debug=debug,
    )
    logger.info(train_set)

    n_classes = train_set.n_classes
    val_set = TrainPatchLoader(
        config,
        split="val",
        is_transform=True,
        augmentations=val_aug,
        debug=debug,
    )

    logger.info(val_set)

    if debug:
        data_flow_dict = dict()

        data_flow_dict["train_patch_loader_length"] = len(train_set)
        data_flow_dict["validation_patch_loader_length"] = len(val_set)
        data_flow_dict["train_input_shape"] = train_set.seismic.shape
        data_flow_dict["train_label_shape"] = train_set.labels.shape
        data_flow_dict["n_classes"] = n_classes

        logger.info("Running in debug mode..")
        train_range = min(
            config.TRAIN.BATCH_SIZE_PER_GPU * config.NUM_DEBUG_BATCHES,
            len(train_set))
        logging.info(f"train range in debug mode {train_range}")
        train_set = data.Subset(train_set, range(train_range))
        valid_range = min(config.VALIDATION.BATCH_SIZE_PER_GPU, len(val_set))
        val_set = data.Subset(val_set, range(valid_range))

        data_flow_dict["train_length_subset"] = len(train_set)
        data_flow_dict["validation_length_subset"] = len(val_set)

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_set, num_replicas=world_size, rank=local_rank)
    val_sampler = torch.utils.data.distributed.DistributedSampler(
        val_set, num_replicas=world_size, rank=local_rank)

    train_loader = data.DataLoader(
        train_set,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
        num_workers=config.WORKERS,
        sampler=train_sampler,
    )
    val_loader = data.DataLoader(
        val_set,
        batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU,
        num_workers=config.WORKERS,
        sampler=val_sampler)

    if debug:
        data_flow_dict["train_loader_length"] = len(train_loader)
        data_flow_dict["validation_loader_length"] = len(val_loader)
        config_file_name = "default_config" if not cfg else cfg.split(
            "/")[-1].split(".")[0]
        fname = f"data_flow_train_{config_file_name}_{config.TRAIN.MODEL_DIR}.json"
        with open(fname, "w") as f:
            json.dump(data_flow_dict, f, indent=2)

    # Model:
    model = getattr(models, config.MODEL.NAME).get_seg_model(config)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Optimizer and LR Scheduler:
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.TRAIN.MAX_LR,
        momentum=config.TRAIN.MOMENTUM,
        weight_decay=config.TRAIN.WEIGHT_DECAY,
    )

    epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
    snapshot_duration = epochs_per_cycle * len(
        train_loader) if not debug else 2 * len(train_loader)
    cosine_scheduler = CosineAnnealingScheduler(
        optimizer,
        "lr",
        config.TRAIN.MAX_LR * world_size,
        config.TRAIN.MIN_LR * world_size,
        cycle_size=snapshot_duration,
    )

    if distributed:
        warmup_duration = 5 * len(train_loader)
        warmup_scheduler = LinearCyclicalScheduler(
            optimizer,
            "lr",
            start_value=config.TRAIN.MAX_LR,
            end_value=config.TRAIN.MAX_LR * world_size,
            cycle_size=10 * len(train_loader),
        )
        scheduler = ConcatScheduler(
            schedulers=[warmup_scheduler, cosine_scheduler],
            durations=[warmup_duration])
    else:
        scheduler = cosine_scheduler

    # class weights are inversely proportional to the frequency of the classes in the training set
    class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS,
                                 device=device,
                                 requires_grad=False)

    # Loss:
    criterion = torch.nn.CrossEntropyLoss(weight=class_weights,
                                          ignore_index=255,
                                          reduction="mean")

    # Model:
    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[device], find_unused_parameters=True)
        if silence_other_ranks & local_rank != 0:
            logging.getLogger("ignite.engine.engine.Engine").setLevel(
                logging.WARNING)

    # Ignite trainer and evaluator:
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        prepare_batch,
                                        device=device)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    # Set to update the epoch parameter of our distributed data sampler so that we get
    # different shuffles
    trainer.add_event_handler(Events.EPOCH_STARTED,
                              update_sampler_epoch(train_loader))

    transform_fn = lambda output_dict: (output_dict["y_pred"].squeeze(),
                                        output_dict["mask"].squeeze())
    evaluator = create_supervised_evaluator(
        model,
        prepare_batch,
        metrics={
            "nll":
            Loss(criterion, output_transform=transform_fn, device=device),
            "pixacc":
            pixelwise_accuracy(n_classes,
                               output_transform=transform_fn,
                               device=device),
            "cacc":
            class_accuracy(n_classes,
                           output_transform=transform_fn,
                           device=device),
            "mca":
            mean_class_accuracy(n_classes,
                                output_transform=transform_fn,
                                device=device),
            "ciou":
            class_iou(n_classes, output_transform=transform_fn, device=device),
            "mIoU":
            mean_iou(n_classes, output_transform=transform_fn, device=device),
        },
        device=device,
    )

    # The model will be saved under: outputs/<config_file_name>/<model_dir>
    config_file_name = "default_config" if not cfg else cfg.split(
        "/")[-1].split(".")[0]
    try:
        output_dir = generate_path(
            config.OUTPUT_DIR,
            git_branch(),
            git_hash(),
            config_file_name,
            config.TRAIN.MODEL_DIR,
            current_datetime(),
        )
    except:
        output_dir = generate_path(
            config.OUTPUT_DIR,
            config_file_name,
            config.TRAIN.MODEL_DIR,
            current_datetime(),
        )

    if local_rank == 0:  # Run only on master process
        # Logging:
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            logging_handlers.log_training_output(
                log_interval=config.PRINT_FREQ),
        )
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  logging_handlers.log_lr(optimizer))

        # Checkpointing: snapshotting trained models to disk
        checkpoint_handler = SnapshotHandler(
            output_dir,
            config.MODEL.NAME,
            extract_metric_from("mIoU"),
            lambda: (trainer.state.iteration % snapshot_duration) == 0,
        )

        evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                    {"model": model})

        # Tensorboard and Logging:
        summary_writer = create_summary_writer(
            log_dir=path.join(output_dir, "logs"))
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"))
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            tensorboard_handlers.log_training_output(summary_writer))
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            tensorboard_handlers.log_validation_output(summary_writer))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        if local_rank == 0:  # Run only on master process
            tensorboard_handlers.log_results(engine,
                                             evaluator,
                                             summary_writer,
                                             n_classes,
                                             stage="Training")
            logging_handlers.log_metrics(engine, evaluator, stage="Training")
            logger.info("Logging training results..")

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        if local_rank == 0:  # Run only on master process
            tensorboard_handlers.log_results(engine,
                                             evaluator,
                                             summary_writer,
                                             n_classes,
                                             stage="Validation")
            logging_handlers.log_metrics(engine, evaluator, stage="Validation")
            logger.info("Logging validation results..")
            # dump validation set metrics at the very end for debugging purposes
            if engine.state.epoch == config.TRAIN.END_EPOCH and debug:
                fname = f"metrics_{config_file_name}_{config.TRAIN.MODEL_DIR}.json"
                metrics = evaluator.state.metrics
                out_dict = {
                    x: metrics[x]
                    for x in ["nll", "pixacc", "mca", "mIoU"]
                }
                with open(fname, "w") as fid:
                    json.dump(out_dict, fid)
                log_msg = " ".join(f"{k}: {out_dict[k]}"
                                   for k in out_dict.keys())
                logging.info(log_msg)

    logger.info("Starting training")
    trainer.run(train_loader,
                max_epochs=config.TRAIN.END_EPOCH,
                epoch_length=len(train_loader),
                seed=config.SEED)
    if local_rank == 0:
        summary_writer.close()
コード例 #9
0
def _evaluate_split(
    split,
    section_aug,
    model,
    pre_processing,
    output_processing,
    device,
    running_metrics_overall,
    config,
    debug=False,
):
    logger = logging.getLogger(__name__)

    TestSectionLoader = get_test_loader(config)

    test_set = TestSectionLoader(
        config.DATASET.ROOT,
        config.DATASET.NUM_CLASSES,
        split=split,
        is_transform=True,
        augmentations=section_aug,
        debug=debug,
    )

    n_classes = test_set.n_classes

    test_loader = data.DataLoader(test_set,
                                  batch_size=1,
                                  num_workers=config.WORKERS,
                                  shuffle=False)

    if debug:
        logger.info("Running in Debug/Test mode")
        test_loader = take(2, test_loader)

    try:
        output_dir = generate_path(
            f"debug/{config.OUTPUT_DIR}_test_{split}",
            git_branch(),
            git_hash(),
            config.MODEL.NAME,
            current_datetime(),
        )
    except:
        output_dir = generate_path(
            f"debug/{config.OUTPUT_DIR}_test_{split}",
            config.MODEL.NAME,
            current_datetime(),
        )

    running_metrics_split = runningScore(n_classes)

    # evaluation mode:
    with torch.no_grad():  # operations inside don't track history
        model.eval()
        total_iteration = 0
        for i, (images, labels) in enumerate(test_loader):
            logger.info(f"split: {split}, section: {i}")
            total_iteration = total_iteration + 1

            outputs = _patch_label_2d(
                model,
                images,
                pre_processing,
                output_processing,
                config.TRAIN.PATCH_SIZE,
                config.TEST.TEST_STRIDE,
                config.VALIDATION.BATCH_SIZE_PER_GPU,
                device,
                n_classes,
                split,
                debug,
            )

            pred = outputs.detach().max(1)[1].numpy()
            gt = labels.numpy()
            running_metrics_split.update(gt, pred)
            running_metrics_overall.update(gt, pred)

            #  dump images to disk for review
            mask_to_disk(pred.squeeze(),
                         os.path.join(output_dir, f"{i}_pred.png"), n_classes)
            mask_to_disk(gt.squeeze(), os.path.join(output_dir, f"{i}_gt.png"),
                         n_classes)

    # get scores
    score, class_iou = running_metrics_split.get_scores()

    # Log split results
    logger.info(f'Pixel Acc: {score["Pixel Acc: "]:.3f}')
    if debug:
        for cdx in range(n_classes):
            logger.info(
                f'  Class_{cdx}_accuracy {score["Class Accuracy: "][cdx]:.3f}')
    else:
        for cdx, class_name in enumerate(_CLASS_NAMES):
            logger.info(
                f'  {class_name}_accuracy {score["Class Accuracy: "][cdx]:.3f}'
            )

    logger.info(f'Mean Class Acc: {score["Mean Class Acc: "]:.3f}')
    logger.info(f'Freq Weighted IoU: {score["Freq Weighted IoU: "]:.3f}')
    logger.info(f'Mean IoU: {score["Mean IoU: "]:0.3f}')
    running_metrics_split.reset()