コード例 #1
0
def test_miou():
    """Check ``mIoU`` against a reference mean IoU computed with NumPy.

    The reference per-class IoU is derived directly from the boolean masks of
    ``y_true`` / ``y_pred`` for the three classes. The metric is verified both
    without ``ignore_index`` and with each class ignored in turn.
    """
    y_true, y_pred = get_y_true_y_pred()
    th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred)

    # Reference per-class IoU = |intersection| / |union| of the binary masks.
    true_res = [0, 0, 0]
    for index in range(3):
        bin_y_true = y_true == index
        bin_y_pred = y_pred == index
        intersection = bin_y_true & bin_y_pred
        union = bin_y_true | bin_y_pred
        true_res[index] = intersection.sum() / union.sum()

    true_res_ = np.mean(true_res)

    cm = ConfusionMatrix(num_classes=3)
    iou_metric = mIoU(cm)

    # Update metric
    output = (th_y_logits, th_y_true)
    cm.update(output)

    res = iou_metric.compute().numpy()

    # Compare with a tolerance: the two values are produced by different
    # floating-point code paths, so exact equality is not guaranteed.
    assert np.isclose(res, true_res_), f"{res} vs {true_res_}"

    for ignore_index in range(3):
        cm = ConfusionMatrix(num_classes=3)
        iou_metric = mIoU(cm, ignore_index=ignore_index)
        # Update metric
        output = (th_y_logits, th_y_true)
        cm.update(output)
        res = iou_metric.compute().numpy()
        # Expected value is the mean over all classes except the ignored one.
        true_res_ = np.mean(true_res[:ignore_index] + true_res[ignore_index + 1:])
        assert np.isclose(res, true_res_), f"{ignore_index}: {res} vs {true_res_}"
コード例 #2
0
def eval_model(model, val_loader, device='cpu', num_classes=21):
    """Run one evaluation pass of ``model`` over ``val_loader``.

    An ignite ``Engine`` is built around a no-grad forward pass and three
    metrics are attached to it under the names ``'mean IoU'``, ``'accuracy'``
    and ``'CE Loss'``.

    Args:
        model: Segmentation model. Its output is used directly, unless it is a
            mapping containing an ``'out'`` entry, in which case that entry is
            used as the prediction.
        val_loader: Iterable of ``(img, mask)`` batches.
        device: Device the batch tensors are moved to.
        num_classes: Number of classes for the confusion matrix.

    Returns:
        The value returned by ``Engine.run`` for the evaluation engine.
    """
    from collections.abc import Mapping

    def evaluate_function(engine, batch):
        model.eval()
        with torch.no_grad():
            img, mask = batch
            img = img.to(device)
            mask = mask.to(device)
            mask_pred = model(img)
            # Some models return a mapping with the prediction under 'out';
            # any other output is passed through unchanged.
            if isinstance(mask_pred, Mapping) and 'out' in mask_pred:
                mask_pred = mask_pred['out']
            return mask_pred, mask

    val_evaluator = Engine(evaluate_function)
    cm = ConfusionMatrix(num_classes=num_classes)
    mIoU(cm).attach(val_evaluator, 'mean IoU')
    Accuracy().attach(val_evaluator, "accuracy")
    Loss(loss_fn=nn.CrossEntropyLoss()).attach(val_evaluator, "CE Loss")

    state = val_evaluator.run(val_loader)

    return state
コード例 #3
0
def make_engine(process_function):
    """Build an ``Engine`` around ``process_function`` with metrics attached.

    The number of classes is read from the ``N_LABELS`` attribute of the
    dataset named by ``CONFIG["dataset"]["name"]`` inside ``datasets``.
    All confusion-matrix based metrics are created with ``ignore_index=0``.

    Returns:
        The engine, with metrics registered as ``'IoU'``, ``'mIoU'``,
        ``'Accuracy'`` and ``'ClasswiseAccuracy'``.
    """
    evaluator = Engine(process_function)

    dataset_name = CONFIG["dataset"]["name"]
    n_labels = getattr(datasets, dataset_name).N_LABELS
    cm = ConfusionMatrix(num_classes=n_labels,
                         output_transform=output_transform)

    # Insertion order of the dict is the attachment order.
    named_metrics = {
        'IoU': IoU(cm, ignore_index=0),
        'mIoU': mIoU(cm, ignore_index=0),
        'Accuracy': Accuracy(output_transform=output_transform),
        'ClasswiseAccuracy': cmAccuracy(cm, ignore_index=0),
    }
    for metric_name, metric in named_metrics.items():
        metric.attach(evaluator, metric_name)

    return evaluator
コード例 #4
0
def evaluation(local_rank, config, logger, with_clearml):
    """Evaluate ``config.model`` on ``config.data_loader`` and log the metrics.

    Args:
        local_rank: Local process index; added to ``config.seed`` for seeding.
        config: Configuration object providing ``seed``, ``data_loader``,
            ``model``, ``num_classes``, ``output_path`` and optionally a
            ``val_metrics`` dict that extends/overrides the default metrics.
        logger: Logger passed to ``get_model_weights`` and
            ``utils.log_metrics``.
        with_clearml: When truthy, a handler that logs the confusion matrix is
            registered on the evaluator.
    """
    rank = idist.get_rank()
    device = idist.device()
    manual_seed(config.seed + local_rank)

    data_loader = config.data_loader
    model = config.model.to(device)

    # Load weights:
    state_dict = get_model_weights(config, logger, with_clearml)
    model.load_state_dict(state_dict)

    # Adapt model to dist config
    model = idist.auto_model(model)

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if ("val_metrics" in config) and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator = create_evaluator(model,
                                 val_metrics,
                                 config,
                                 with_clearml,
                                 tag="val")

    # Setup Tensorboard logger (rank 0 only)
    tb_logger = None
    if rank == 0:
        tb_logger = common.TensorboardLogger(
            log_dir=config.output_path.as_posix())

    try:
        if tb_logger is not None:
            tb_logger.attach_output_handler(
                evaluator,
                event_name=Events.COMPLETED,
                tag="validation",
                metric_names="all",
            )

        # Log confusion matrix to ClearML:
        if with_clearml:
            # NOTE(review): ``evaluator.state.iteration`` is read here, at
            # registration time, not when the handler fires - confirm that
            # the pre-run value is the intended one.
            evaluator.add_event_handler(Events.COMPLETED, compute_and_log_cm,
                                        cm_metric, evaluator.state.iteration)

        state = evaluator.run(data_loader)
        utils.log_metrics(logger, 0, state.times["COMPLETED"], "Validation",
                          state.metrics)
    finally:
        # Close the logger even if evaluation raised.
        if tb_logger is not None:
            tb_logger.close()
コード例 #5
0
def step_train_supervised(model,
                          train_loader,
                          criterion,
                          optimizer,
                          device='cpu',
                          num_classes=21):
    """Run one pass of fully supervised segmentation training.

    An ignite ``Engine`` performs an optimisation step per batch, and three
    metrics are attached to it under the names ``'mean IoU'``,
    ``'accuracy'`` and ``'CE Loss'``.

    Args:
        model: Segmentation model. Its output is used directly, unless it is a
            mapping containing an ``'out'`` entry, in which case that entry is
            used as the prediction.
        train_loader: Iterable of ``(img, mask)`` batches.
        criterion: Loss used for the backward pass.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        num_classes: Number of classes for the confusion matrix.

    Returns:
        The value returned by ``Engine.run`` for the training engine.
    """
    from collections.abc import Mapping

    def train_function(engine, batch):
        optimizer.zero_grad()
        model.train()
        img, mask = batch
        img = img.to(device)
        mask = mask.to(device)
        mask_pred = model(img)
        # Some models return a mapping with the prediction under 'out';
        # any other output is passed through unchanged.
        if isinstance(mask_pred, Mapping) and 'out' in mask_pred:
            mask_pred = mask_pred['out']
        loss = criterion(mask_pred, mask)
        loss.backward()
        optimizer.step()

        return mask_pred, mask

    train_engine = Engine(train_function)
    cm = ConfusionMatrix(num_classes=num_classes)
    mIoU(cm).attach(train_engine, 'mean IoU')
    Accuracy().attach(train_engine, "accuracy")
    # The reported "CE Loss" metric is always cross-entropy, independently of
    # the ``criterion`` used for optimisation.
    Loss(loss_fn=nn.CrossEntropyLoss()).attach(train_engine, "CE Loss")
    state = train_engine.run(train_loader)

    return state
コード例 #6
0
                                 [3, 2, 1, 1],
                                 [3, 2, 1, 2],
                                 [1, 1, 0, 0]])
        mask = torch.LongTensor([[1, 1, 0, 1],
                                 [1, 2, 1, 0],
                                 [3, 1, 2, 2],
                                 [3, 1, 0, 0]])
        pred = make_one_hot(pred, 4)
        return pred[0], mask

# Demo script: evaluate an empty ``nn.Sequential`` with confusion-matrix based
# mIoU / IoU metrics (class 0 ignored) on the ``Data`` dataset.
# NOTE(review): an empty Sequential presumably returns its input unchanged,
# so the dataset's ``pred`` would be what gets scored - confirm.
model = nn.Sequential()
# NOTE(review): ``criterion`` and ``Adam`` are not used in the visible code.
criterion =nn.BCELoss()
from torch.optim import Adam
device = torch.device('cpu')
# Both metrics share the same 4-class confusion matrix.
cm = ConfusionMatrix(num_classes=4)
miou=mIoU(cm, ignore_index=0)
iou = IoU(cm,ignore_index=0)
metric = {
    'mIOU':miou,
    'IOU':iou
}
evaluator =create_supervised_evaluator (model,metric,device=device)
data_loader = DataLoader(Data())
# First run: print the output stored on the engine state after the run.
evaluator.run(data_loader)
print(evaluator.state.output)



# Second run over the same loader: print the computed metric values.
state = evaluator.run(data_loader)
print(state.metrics['mIOU'])
print(state.metrics['IOU'])
コード例 #7
0
def inference(config, local_rank, with_pbar_on_iters=True):
    """Run distributed inference (and optional validation) for ``config.model``.

    The model is wrapped in ``DistributedDataParallel`` on the CUDA device
    ``local_rank``, weights are loaded, and an ignite ``Engine`` is run over
    ``config.data_loader``. On rank 0, predictions can be written under
    ``config.output_path`` and metrics are logged through ``MLflowLogger``.

    Args:
        config: Configuration object. Required attributes: ``seed``,
            ``run_uuid``, ``weights_filename``, ``model``, ``prepare_batch``,
            ``data_loader``; plus ``num_classes`` when ``has_targets`` is true
            and ``output_path`` / ``img_denormalize`` when predictions are
            saved. Optional attributes: ``custom_weights_loading``,
            ``non_blocking``, ``model_output_transform``, ``tta_transforms``,
            ``has_targets``, ``metrics``, ``do_save_raw_predictions``,
            ``do_save_overlayed_predictions``.
        local_rank: Local process index, used as CUDA device id and added to
            the seed.
        with_pbar_on_iters: When true, rank 0 attaches a progress bar.

    Raises:
        AssertionError: If the weights file does not exist, or if there are no
            targets and both prediction-saving options are disabled.
    """
    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    # Load model and weights
    model_weights_filepath = Path(
        get_artifact_path(config.run_uuid, config.weights_filename))
    assert model_weights_filepath.exists(), \
        "Model weights file '{}' is not found".format(model_weights_filepath.as_posix())

    model = config.model.to(device)
    model = torch.nn.parallel.DistributedDataParallel(model,
                                                      device_ids=[local_rank],
                                                      output_device=local_rank)

    if hasattr(config, "custom_weights_loading"):
        config.custom_weights_loading(model, model_weights_filepath)
    else:
        # Load onto CPU first: without ``map_location`` every process would
        # restore the tensors onto the device they were saved from.
        # ``load_state_dict`` then copies them to the model's device.
        state_dict = torch.load(model_weights_filepath, map_location="cpu")
        # The model is DDP-wrapped, so its keys carry a "module." prefix;
        # add it when the checkpoint was saved from an unwrapped model.
        if not all(k.startswith("module.") for k in state_dict):
            state_dict = {f"module.{k}": v for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

    model.eval()

    prepare_batch = config.prepare_batch
    non_blocking = getattr(config, "non_blocking", True)
    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    tta_transforms = getattr(config, "tta_transforms", None)

    def eval_update_function(engine, batch):
        """Forward one batch; with TTA, average the de-augmented predictions."""
        with torch.no_grad():
            x, y, meta = prepare_batch(batch,
                                       device=device,
                                       non_blocking=non_blocking)

            if tta_transforms is not None:
                y_preds = []
                for t in tta_transforms:
                    t_x = t.augment_image(x)
                    t_y_pred = model(t_x)
                    t_y_pred = model_output_transform(t_y_pred)
                    y_pred = t.deaugment_mask(t_y_pred)
                    y_preds.append(y_pred)

                y_preds = torch.stack(y_preds, dim=0)
                y_pred = torch.mean(y_preds, dim=0)
            else:
                y_pred = model(x)
                y_pred = model_output_transform(y_pred)
            return {"y_pred": y_pred, "y": y, "meta": meta}

    evaluator = Engine(eval_update_function)

    has_targets = getattr(config, "has_targets", False)

    if has_targets:

        def output_transform(output):
            return output['y_pred'], output['y']

        num_classes = config.num_classes
        cm_metric = ConfusionMatrix(num_classes=num_classes,
                                    output_transform=output_transform)
        pr = cmPrecision(cm_metric, average=False)
        re = cmRecall(cm_metric, average=False)

        val_metrics = {
            "IoU": IoU(cm_metric),
            "mIoU_bg": mIoU(cm_metric),
            "Accuracy": cmAccuracy(cm_metric),
            "Precision": pr,
            "Recall": re,
            "F1": Fbeta(beta=1.0, output_transform=output_transform)
        }

        # User-supplied metrics extend or override the defaults.
        if hasattr(config, "metrics") and isinstance(config.metrics, dict):
            val_metrics.update(config.metrics)

        for name, metric in val_metrics.items():
            metric.attach(evaluator, name)

        if dist.get_rank() == 0:
            # Log val metrics:
            mlflow_logger = MLflowLogger()
            mlflow_logger.attach(evaluator,
                                 log_handler=OutputHandler(
                                     tag="validation",
                                     metric_names=list(val_metrics.keys())),
                                 event_name=Events.EPOCH_COMPLETED)

    if dist.get_rank() == 0 and with_pbar_on_iters:
        ProgressBar(persist=True, desc="Inference").attach(evaluator)

    if dist.get_rank() == 0:
        do_save_raw_predictions = getattr(config, "do_save_raw_predictions",
                                          True)
        do_save_overlayed_predictions = getattr(
            config, "do_save_overlayed_predictions", True)

        if not has_targets:
            assert do_save_raw_predictions or do_save_overlayed_predictions, \
                "If no targets, either do_save_overlayed_predictions or do_save_raw_predictions should be " \
                "defined in the config and has value equal True"

        # Save predictions
        if do_save_raw_predictions:
            raw_preds_path = config.output_path / "raw"
            # No ``exist_ok``: raises if the directory already exists.
            raw_preds_path.mkdir(parents=True)

            evaluator.add_event_handler(Events.ITERATION_COMPLETED,
                                        save_raw_predictions_with_geoinfo,
                                        raw_preds_path)

        if do_save_overlayed_predictions:
            overlayed_preds_path = config.output_path / "overlay"
            # No ``exist_ok``: raises if the directory already exists.
            overlayed_preds_path.mkdir(parents=True)

            evaluator.add_event_handler(
                Events.ITERATION_COMPLETED,
                save_overlayed_predictions,
                overlayed_preds_path,
                img_denormalize_fn=config.img_denormalize,
                palette=default_palette)

    evaluator.add_event_handler(Events.EXCEPTION_RAISED, report_exception)

    # Run evaluation
    evaluator.run(config.data_loader)
コード例 #8
0
def run(train_config, logger, **kwargs):
    """Train a model with a supervised loss plus a consistency loss.

    Builds an ignite trainer around ``train_update_function``, two evaluators
    (train / test) around ``inference_update_function``, and wires up
    checkpointing, Tensorboard, optional Polyaxon and MLflow logging.

    Args:
        train_config: Configuration module/object. Required attributes:
            ``num_epochs``, ``num_classes``, ``train1_sup_loader``,
            ``train1_unsup_loader``, ``train2_unsup_loader``, ``test_loader``,
            ``model``, ``optimizer``, ``criterion``, ``consistency_criterion``,
            ``learning_rate``, ``consistency_lambda``, ``TSA_proba_min``,
            ``TSA_proba_max`` and ``__file__``. Optional attributes:
            ``debug``, ``saves``, ``device``, ``use_fp_16``, ``val_interval``,
            ``pred_interval``, ``cm_metric``, ``is_bn``, ``num_warmup_steps``,
            ``lr_scheduler``, ``reduce_on_plateau``,
            ``output_transform_model``, ``inference_fn``,
            ``consistency_beta``, ``with_TSA``.
        logger: Ignored; the function always uses ``logging.getLogger('UDA')``.
        **kwargs: Unused.

    Raises:
        ImportError: If ``use_fp_16`` is set and ``apex`` cannot be imported.
        AssertionError: If a non-CPU device is requested without CUDA, or the
            half-precision prerequisites are not met.
    """
    logger = logging.getLogger('UDA')
    if getattr(train_config, 'debug', False):
        setup_logger(logger, logging.DEBUG)

    # Set Polyaxon environment if needed
    plx_logger = None
    save_dir = None
    output_experiment_path = None
    try:
        plx_logger = PolyaxonLogger()
        experiment = plx_logger.experiment
        save_dir = get_outputs_path()
        output_experiment_path = get_outputs_refs_paths()
        output_experiment_path = output_experiment_path['experiments'][
            0] if output_experiment_path else None
        logger.debug("Experiment info: {}".format(
            experiment.get_experiment_info()))
    except PolyaxonClientException as e:
        logger.warning('Logger Polyaxon : ' + str(e))

    # Path configuration
    saves_dict = getattr(train_config, 'saves', {})

    save_dir = saves_dict.get('save_dir', '') if save_dir is None else save_dir
    log_dir = os.path.join(save_dir, saves_dict.get('log_dir', ''))
    save_model_dir = os.path.join(save_dir, saves_dict.get('model_dir', ''))
    save_prediction_dir = os.path.join(save_dir,
                                       saves_dict.get('prediction_dir', ''))
    save_config_dir = os.path.join(save_dir, saves_dict.get('config_dir', ''))
    load_model_file = saves_dict.get('load_model_file', '')
    load_optimizer_file = saves_dict.get('load_optimizer_file', '')

    # Create folders
    create_save_folders(save_dir, saves_dict)

    # When a referenced experiment output exists, checkpoints are resolved
    # relative to it.
    if output_experiment_path is not None:
        model_dir = saves_dict.get('model_dir', '')
        load_model_file = os.path.join(
            output_experiment_path, model_dir,
            load_model_file) if load_model_file else None
        load_optimizer_file = os.path.join(
            output_experiment_path, model_dir,
            load_optimizer_file) if load_optimizer_file else None

    num_epochs = getattr(train_config, 'num_epochs')
    num_classes = getattr(train_config, 'num_classes')
    device = getattr(train_config, 'device', 'cpu')

    # Set magical acceleration
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
    else:
        assert device == 'cpu', 'CUDA device selected but none is available'

    # Set half precision if required
    use_fp_16 = getattr(train_config, 'use_fp_16', False)

    train1_sup_loader = getattr(train_config, 'train1_sup_loader')
    train1_unsup_loader = getattr(train_config, 'train1_unsup_loader')
    train2_unsup_loader = getattr(train_config, 'train2_unsup_loader')
    test_loader = getattr(train_config, 'test_loader')

    save_interval = saves_dict.get('save_interval', 0)
    n_saved = saves_dict.get('n_saved', 0)

    val_interval = getattr(train_config, 'val_interval', 1)
    pred_interval = getattr(train_config, 'pred_interval', 0)

    model = getattr(train_config, 'model').to(device)

    optimizer = getattr(train_config, 'optimizer')

    criterion = getattr(train_config, 'criterion').to(device)
    consistency_criterion = getattr(train_config,
                                    'consistency_criterion').to(device)

    cm_metric = getattr(
        train_config, 'cm_metric',
        ConfusionMatrix(num_classes=num_classes,
                        output_transform=lambda x: (x['y_pred'], x['y'])))

    # AMP initialization for half precision
    if use_fp_16:
        assert 'cuda' in device
        assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
        try:
            from apex import amp
        except ImportError as err:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to run this example."
            ) from err
        # Initialize amp
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

    # Load checkpoint
    load_params(model,
                optimizer=optimizer,
                model_file=load_model_file,
                optimizer_file=load_optimizer_file,
                device_name=device)

    # Add batch norm
    is_bn = getattr(train_config, 'is_bn', False)
    if is_bn:
        batch_norm = nn.BatchNorm2d(3).to(device)
        if use_fp_16:
            batch_norm = amp.initialize(batch_norm)
        batch_norm.reset_parameters()
        model = nn.Sequential(batch_norm, model)

    # Copy the config file
    shutil.copy2(os.path.abspath(train_config.__file__),
                 os.path.join(save_config_dir, 'checkpoint_module.py'))

    le = len(train1_sup_loader)
    num_train_steps = le * num_epochs
    mlflow.log_param("num train steps", num_train_steps)

    lr = getattr(train_config, 'learning_rate')
    num_warmup_steps = getattr(train_config, 'num_warmup_steps', 0)

    # ``lr_scheduler`` in the config is a factory taking the optimizer.
    lr_scheduler = getattr(train_config, 'lr_scheduler', None)
    if lr_scheduler is not None:
        lr_scheduler = lr_scheduler(optimizer)

    if num_warmup_steps > 0:
        lr_scheduler = create_lr_scheduler_with_warmup(
            lr_scheduler,
            warmup_start_value=0.0,
            warmup_end_value=lr * (1.0 + 1.0 / num_warmup_steps),
            warmup_duration=num_warmup_steps)

    # The trainer iterates over plain step indices (see ``data_steps``
    # below); actual batches are drawn from these cycled iterators.
    train1_sup_loader_iter = cycle(train1_sup_loader)
    train1_unsup_loader_iter = cycle(train1_unsup_loader)
    train2_unsup_loader_iter = cycle(train2_unsup_loader)

    # Reduce on plateau
    reduce_on_plateau = getattr(train_config, 'reduce_on_plateau', None)

    # Output transform model
    output_transform_model = getattr(train_config, 'output_transform_model',
                                     lambda x: x)

    inference_fn = getattr(train_config, 'inference_fn', inference_standard)

    lam = getattr(train_config, 'consistency_lambda')
    beta = getattr(train_config, 'consistency_beta', lam)

    tsa = TrainingSignalAnnealing(
        num_steps=num_train_steps,
        min_threshold=getattr(train_config, 'TSA_proba_min'),
        max_threshold=getattr(train_config, 'TSA_proba_max'))

    with_tsa = getattr(train_config, 'with_TSA', False)

    cfg = {
        'tsa': tsa,
        'lambda': lam,
        'beta': beta,
        'with_tsa': with_tsa,
        'device': device,
        'consistency_criterion': consistency_criterion,
        'criterion': criterion
    }

    trainer = Engine(
        partial(train_update_function,
                model=model,
                optimizer=optimizer,
                cfg=cfg,
                train1_sup_loader_iter=train1_sup_loader_iter,
                train1_unsup_loader_iter=train1_unsup_loader_iter,
                train2_unsup_loader_iter=train2_unsup_loader_iter,
                output_transform_model=output_transform_model,
                use_fp_16=use_fp_16))

    # Register events
    for e in CustomEvents:
        State.event_to_attr[e] = 'iteration'

    trainer.register_events(*CustomEvents)

    if with_tsa:
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_tsa, tsa)

    if lr_scheduler is not None:
        # Objects without ``step`` are used directly as event handlers;
        # objects with ``step`` are stepped once per iteration.
        if not hasattr(lr_scheduler, "step"):
            trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)
        else:
            trainer.add_event_handler(Events.ITERATION_STARTED,
                                      lambda engine: lr_scheduler.step())

    trainer.add_event_handler(Events.ITERATION_COMPLETED, log_learning_rate,
                              optimizer)

    metric_names = [
        'supervised batch loss', 'consistency batch loss', 'final batch loss'
    ]

    def output_transform(x, name):
        return x[name]

    for n in metric_names:
        RunningAverage(
            output_transform=partial(output_transform, name=n)).attach(
                trainer, n)

    ProgressBar(persist=True,
                bar_format="").attach(trainer,
                                      event_name=Events.EPOCH_STARTED,
                                      closing_event_name=Events.COMPLETED)

    # Handlers for Tensorboard logging
    tb_logger = TensorboardLogger(log_dir=log_dir)
    tb_logger.attach(trainer,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names=metric_names),
                     event_name=CustomEvents.ITERATION_K_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=tbOptimizerParamsHandler(optimizer,
                                                          param_name="lr"),
                     event_name=CustomEvents.ITERATION_K_STARTED)

    # Handlers for Polyaxon logging
    if plx_logger is not None:
        plx_logger.attach(trainer,
                          log_handler=plxOutputHandler(
                              tag="train", metric_names=metric_names),
                          event_name=CustomEvents.ITERATION_K_COMPLETED)

    metrics = {
        'loss': Loss(criterion,
                     output_transform=lambda x: (x['y_pred'], x['y'])),
        'mAcc': cmAccuracy(cm_metric).mean(),
        'mPr': cmPrecision(cm_metric).mean(),
        'mRe': cmRecall(cm_metric).mean(),
        'mIoU': mIoU(cm_metric),
        'mF1': cmFbeta(cm_metric, 1).mean()
    }
    # One extra metric per class: 'IoU_0', 'IoU_1', ...
    iou = IoU(cm_metric)
    for i in range(num_classes):
        key_name = 'IoU_{}'.format(str(i))
        metrics[key_name] = iou[i]

    inference_update_fn = partial(
        inference_update_function,
        model=model,
        cfg=cfg,
        output_transform_model=output_transform_model,
        inference_fn=inference_fn)

    evaluator = Engine(inference_update_fn)
    train_evaluator = Engine(inference_update_fn)

    # The same metric objects are attached to both evaluators.
    for name, metric in metrics.items():
        metric.attach(train_evaluator, name)
        metric.attach(evaluator, name)

    # Add checkpoint
    if save_model_dir:
        checkpoint = ModelCheckpoint(dirname=save_model_dir,
                                     filename_prefix='checkpoint',
                                     save_interval=save_interval,
                                     n_saved=n_saved,
                                     create_dir=True)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint, {
            'mymodel': model,
            'optimizer': optimizer
        })

    def trigger_k_iteration_started(engine, k):
        """Fire ``ITERATION_K_STARTED`` every ``k`` iterations."""
        if engine.state.iteration % k == 0:
            engine.fire_event(CustomEvents.ITERATION_K_STARTED)

    def trigger_k_iteration_completed(engine, k):
        """Fire ``ITERATION_K_COMPLETED`` every ``k`` iterations."""
        if engine.state.iteration % k == 0:
            engine.fire_event(CustomEvents.ITERATION_K_COMPLETED)

    def run_validation(engine, validation_interval):
        """Evaluate on train/test data every ``validation_interval`` epochs."""
        if (trainer.state.epoch - 1) % validation_interval == 0:
            train_evaluator.run(train1_sup_loader)
            evaluator.run(test_loader)

            if save_prediction_dir:
                train_output = train_evaluator.state.output
                test_output = evaluator.state.output

                iteration = str(trainer.state.iteration)
                epoch = str(trainer.state.epoch)

                # Only the first sample of the last batch is saved.
                save_prediction('train_{}_{}'.format(iteration, epoch),
                                save_prediction_dir,
                                train_output['x'],
                                torch.argmax(
                                    train_output['y_pred'][0, :, :, :], dim=0),
                                y=train_output['y'][0, :, :])

                save_prediction('test_{}_{}'.format(iteration, epoch),
                                save_prediction_dir,
                                test_output['x'],
                                torch.argmax(test_output['y_pred'][0, :, :, :],
                                             dim=0),
                                y=test_output['y'][0, :, :])

            # Drop references to the last batch outputs.
            train_evaluator.state.output = None
            evaluator.state.output = None

            if reduce_on_plateau is not None:
                reduce_on_plateau.step(evaluator.state.metrics['mIoU'])

    trainer.add_event_handler(Events.ITERATION_STARTED,
                              trigger_k_iteration_started,
                              k=10)
    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              trigger_k_iteration_completed,
                              k=10)

    # A zero interval would raise ZeroDivisionError in the modulo inside
    # ``run_validation``; treat it as "no periodic validation".
    if val_interval:
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  run_validation,
                                  validation_interval=val_interval)
    trainer.add_event_handler(Events.COMPLETED,
                              run_validation,
                              validation_interval=1)

    def trainer_prediction_save(engine, prediction_interval):
        """Save the trainer's unsupervised prediction periodically."""
        if (engine.state.iteration - 1) % prediction_interval == 0:

            if save_prediction_dir:
                trainer_output = trainer.state.output['unsup pred']

                iteration = str(trainer.state.iteration)
                epoch = str(trainer.state.epoch)

                save_prediction('trainer_{}_{}'.format(iteration, epoch),
                                save_prediction_dir, trainer_output['x'],
                                trainer_output['y_pred'])

                logger.debug(
                    'Saved trainer prediction for iteration {}'.format(
                        str(engine.state.iteration)))

            trainer.state.output = None

    # ``pred_interval`` defaults to 0, which would raise ZeroDivisionError in
    # the modulo above on the first iteration; 0 means "never save".
    if pred_interval:
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  trainer_prediction_save,
                                  prediction_interval=pred_interval)

    tb_logger.attach(train_evaluator,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names=list(
                                                     metrics.keys())),
                     event_name=Events.EPOCH_COMPLETED)

    tb_logger.attach(evaluator,
                     log_handler=tbOutputHandler(tag="test",
                                                 metric_names=list(
                                                     metrics.keys())),
                     event_name=Events.EPOCH_COMPLETED)

    # Handlers for Polyaxon logging
    if plx_logger is not None:
        plx_logger.attach(train_evaluator,
                          log_handler=plxOutputHandler(tag="train",
                                                       metric_names=list(
                                                           metrics.keys())),
                          event_name=Events.EPOCH_COMPLETED)

        plx_logger.attach(evaluator,
                          log_handler=plxOutputHandler(tag="test",
                                                       metric_names=list(
                                                           metrics.keys())),
                          event_name=Events.EPOCH_COMPLETED)

    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              mlflow_batch_metrics_logging, "train", trainer)
    train_evaluator.add_event_handler(Events.COMPLETED,
                                      mlflow_val_metrics_logging, "train",
                                      trainer)
    evaluator.add_event_handler(Events.COMPLETED, mlflow_val_metrics_logging,
                                "test", trainer)

    # One dummy item per supervised batch defines the epoch length.
    data_steps = list(range(len(train1_sup_loader)))

    logger.debug('Start training')
    trainer.run(data_steps, max_epochs=num_epochs)
    logger.debug('Finished training')
コード例 #9
0
def run(args):
    """Train the Box2Pix multi-task model and evaluate it after every epoch.

    Builds the data loaders, model, losses and optimizer, optionally restores
    a checkpoint, and wires up three engines: ``trainer`` (optimisation),
    ``train_evaluator`` and ``evaluator`` (metrics on the training and
    validation loaders). Checkpointing, a progress bar, a per-batch timer and
    TensorBoard logging are attached before the training loop is started.

    Args:
        args: Namespace of run options. Attributes read here: ``dir``,
            ``batch_size``, ``num_workers``, ``seed``, ``lr``, ``resume``,
            ``output_dir``, ``log_dir`` and ``epochs``. ``start_epoch`` is
            written to it when a checkpoint is successfully loaded.
    """
    train_loader, val_loader = get_data_loaders(args.dir, args.batch_size,
                                                args.num_workers)

    if args.seed is not None:
        torch.manual_seed(args.seed)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # One extra class on top of the instance classes.
    num_classes = CityscapesDataset.num_instance_classes() + 1
    model = models.box2pix(num_classes)
    model.init_from_googlenet()

    if torch.cuda.device_count() > 1:
        print("Using %d GPU(s)" % torch.cuda.device_count())
        model = nn.DataParallel(model)

    model = model.to(device)

    semantics_criterion = nn.CrossEntropyLoss(ignore_index=255)
    offsets_criterion = nn.MSELoss()
    box_criterion = BoxLoss(num_classes, gamma=2)
    multitask_criterion = MultiTaskLoss().to(device)

    box_coder = BoxCoder()
    # The multi-task loss has its own parameters, optimised jointly with the
    # model.
    optimizer = optim.Adam([{
        'params': model.parameters()
    }, {
        'params': multitask_criterion.parameters()
    }],
                           lr=args.lr)

    if args.resume:
        if os.path.isfile(args.resume):
            print("Loading checkpoint '{}'".format(args.resume))
            # map_location lets a checkpoint written on another device (e.g.
            # a GPU) be restored on the device selected for this run.
            checkpoint = torch.load(args.resume, map_location=device)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            multitask_criterion.load_state_dict(checkpoint['multitask'])
            print("Loaded checkpoint '{}' (Epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("No checkpoint found at '{}'".format(args.resume))

    def _prepare_batch(batch, non_blocking=True):
        """Move the four tensors of a batch to the training device."""
        x, instance, boxes, labels = batch

        return (convert_tensor(x, device=device, non_blocking=non_blocking),
                convert_tensor(instance,
                               device=device,
                               non_blocking=non_blocking),
                convert_tensor(boxes, device=device,
                               non_blocking=non_blocking),
                convert_tensor(labels,
                               device=device,
                               non_blocking=non_blocking))

    def _update(engine, batch):
        """Run one optimisation step and return the individual loss values."""
        model.train()
        optimizer.zero_grad()
        x, instance, boxes, labels = _prepare_batch(batch)
        boxes, labels = box_coder.encode(boxes, labels)

        loc_preds, conf_preds, semantics_pred, offsets_pred = model(x)

        semantics_loss = semantics_criterion(semantics_pred, instance)
        offsets_loss = offsets_criterion(offsets_pred, instance)
        box_loss, conf_loss = box_criterion(loc_preds, boxes, conf_preds,
                                            labels)

        loss = multitask_criterion(semantics_loss, offsets_loss, box_loss,
                                   conf_loss)

        loss.backward()
        optimizer.step()

        return {
            'loss': loss.item(),
            'loss_semantics': semantics_loss.item(),
            'loss_offsets': offsets_loss.item(),
            'loss_ssdbox': box_loss.item(),
            'loss_ssdclass': conf_loss.item()
        }

    trainer = Engine(_update)

    checkpoint_handler = ModelCheckpoint(args.output_dir,
                                         'checkpoint',
                                         save_interval=1,
                                         n_saved=10,
                                         require_empty=False,
                                         create_dir=True,
                                         save_as_state_dict=False)
    timer = Timer(average=True)

    # attach running average metrics
    train_metrics = [
        'loss', 'loss_semantics', 'loss_offsets', 'loss_ssdbox',
        'loss_ssdclass'
    ]
    for m in train_metrics:
        # partial binds the current name; a bare closure over `m` would see
        # only the last loop value.
        transform = partial(lambda x, metric: x[metric], metric=m)
        RunningAverage(output_transform=transform).attach(trainer, m)

    # attach progress bar
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=train_metrics)

    @trainer.on(Events.EPOCH_COMPLETED)
    def save_checkpoint(engine):
        # Build the snapshot at save time: a dict created once during setup
        # would record the epoch and optimizer state from before training
        # started (and would touch trainer.state before the run creates it).
        checkpoint_handler(
            engine, {
                'checkpoint': {
                    'model': model.state_dict(),
                    'epoch': engine.state.epoch,
                    'optimizer': optimizer.state_dict(),
                    'multitask': multitask_criterion.state_dict()
                }
            })

    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED,
                 step=Events.ITERATION_COMPLETED)

    def _inference(engine, batch):
        """Evaluate one batch and return losses, detections and masks."""
        model.eval()
        with torch.no_grad():
            x, instance, boxes, labels = _prepare_batch(batch)
            loc_preds, conf_preds, semantics, offsets_pred = model(x)
            boxes_preds, labels_preds, scores_preds = box_coder.decode(
                loc_preds, F.softmax(conf_preds, dim=1), score_thresh=0.01)

            semantics_loss = semantics_criterion(semantics, instance)
            offsets_loss = offsets_criterion(offsets_pred, instance)
            box_loss, conf_loss = box_criterion(loc_preds, boxes, conf_preds,
                                                labels)

            semantics_pred = semantics.argmax(dim=1)
            instances = helper.assign_pix2box(semantics_pred, offsets_pred,
                                              boxes_preds, labels_preds)

        return {
            'loss': (semantics_loss, offsets_loss, {
                'box_loss': box_loss,
                'conf_loss': conf_loss
            }),
            'objects':
            (boxes_preds, labels_preds, scores_preds, boxes, labels),
            'semantics':
            semantics_pred,
            'instances':
            instances
        }

    def _create_evaluator():
        """Return an inference engine with mIoU, loss and mAP attached.

        Each call creates fresh metric objects, so the training and
        validation evaluators never share state and every metric is attached
        to the engine it is later read from.
        """
        eval_engine = Engine(_inference)
        # NOTE(review): this transform forwards only the predicted label map;
        # confirm ConfusionMatrix does not also need the ground truth here.
        cm = ConfusionMatrix(num_classes=num_classes,
                             output_transform=lambda x: x['semantics'])
        mIoU(cm, ignore_index=0).attach(eval_engine, 'mIoU')
        Loss(multitask_criterion,
             output_transform=lambda x: x['loss']).attach(eval_engine, 'loss')
        MeanAveragePrecision(num_classes,
                             output_transform=lambda x: x['objects']).attach(
                                 eval_engine, 'mAP')
        return eval_engine

    train_evaluator = _create_evaluator()
    evaluator = _create_evaluator()

    tb_logger = TensorboardLogger(args.log_dir)
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(
                         tag='training',
                         output_transform=lambda loss: {
                             'loss': loss['loss'],
                             'loss_semantics': loss['loss_semantics'],
                             'loss_offsets': loss['loss_offsets'],
                             'loss_ssdbox': loss['loss_ssdbox'],
                             'loss_ssdclass': loss['loss_ssdclass']
                         }),
                     event_name=Events.ITERATION_COMPLETED)

    tb_logger.attach(train_evaluator,
                     log_handler=OutputHandler(
                         tag='training_eval',
                         metric_names=['loss', 'mAP', 'mIoU'],
                         output_transform=lambda loss: {
                             'loss': loss['loss'],
                             'objects': loss['objects'],
                             'semantics': loss['semantics']
                         },
                         another_engine=trainer),
                     event_name=Events.EPOCH_COMPLETED)

    tb_logger.attach(evaluator,
                     log_handler=OutputHandler(
                         tag='validation_eval',
                         metric_names=['loss', 'mAP', 'mIoU'],
                         output_transform=lambda loss: {
                             'loss': loss['loss'],
                             'objects': loss['objects'],
                             'semantics': loss['semantics']
                         },
                         another_engine=trainer),
                     event_name=Events.EPOCH_COMPLETED)

    @trainer.on(Events.STARTED)
    def initialize(engine):
        # Continue epoch numbering from the restored checkpoint.
        if args.resume:
            engine.state.epoch = args.start_epoch

    @trainer.on(Events.EPOCH_COMPLETED)
    def print_times(engine):
        pbar.log_message(
            "Epoch [{}/{}] done. Time per batch: {:.3f}[s]".format(
                engine.state.epoch, engine.state.max_epochs, timer.value()))
        timer.reset()

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        train_evaluator.run(train_loader)
        metrics = train_evaluator.state.metrics
        loss = metrics['loss']
        mean_ap = metrics['mAP']
        iou = metrics['mIoU']

        # Epoch counters come from the trainer (`engine`); arguments follow
        # the placeholder order: epoch, max epochs, loss, mAP, IoU.
        pbar.log_message(
            'Training results - Epoch: [{}/{}]: Loss: {:.4f}, mAP(50%): {:.1f}, IoU: {:.1f}'
            .format(engine.state.epoch, engine.state.max_epochs, loss,
                    mean_ap, iou * 100.0))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        loss = metrics['loss']
        mean_ap = metrics['mAP']
        iou = metrics['mIoU']

        pbar.log_message(
            'Validation results - Epoch: [{}/{}]: Loss: {:.4f}, mAP(50%): {:.1f}, IoU: {:.1f}'
            .format(engine.state.epoch, engine.state.max_epochs, loss,
                    mean_ap, iou * 100.0))

    @trainer.on(Events.EXCEPTION_RAISED)
    def handle_exception(engine, e):
        # Ctrl-C after the first iteration: stop cleanly and keep the model.
        if isinstance(e, KeyboardInterrupt) and (engine.state.iteration > 1):
            engine.terminate()
            warnings.warn("KeyboardInterrupt caught. Exiting gracefully.")

            checkpoint_handler(engine, {'model_exception': model})
        else:
            raise e

    @trainer.on(Events.COMPLETED)
    def save_final_model(engine):
        checkpoint_handler(engine, {'final': model})

    try:
        trainer.run(train_loader, max_epochs=args.epochs)
    finally:
        # Close the TensorBoard logger even when training raises.
        tb_logger.close()
コード例 #10
0
ファイル: training.py プロジェクト: lavanyashukla/ignite
def training(local_rank, config, logger=None):
    """Run segmentation training with periodic train/validation evaluation.

    Creates the trainer and two evaluators, schedules validation, optionally
    adds early stopping, keeps the three best models by ``mIoU_bg`` and, on
    rank 0, sets up TensorBoard and experiment-tracking logging.

    Args:
        local_rank: Index added to ``config.seed`` to seed this process.
        config: Configuration object. Attributes read here: ``seed``,
            ``train_loader``, ``val_loader``, ``train_eval_loader``,
            ``num_classes``, ``num_epochs``, ``output_path``,
            ``img_denormalize`` and the optional ``use_fp16``,
            ``val_metrics``, ``val_interval``, ``start_by_validation`` and
            ``es_patience``.
        logger: Logger forwarded to ``create_trainer`` and ``log_metrics``.

    Raises:
        RuntimeError: If ``config.use_fp16`` is set to a false value.
    """

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    torch.backends.cudnn.benchmark = True

    set_seed(config.seed + local_rank)

    train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader

    # Setup model, optimizer, criterion
    model, optimizer, criterion = initialize(config)

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler,
                             config, logger)

    # Setup evaluators
    num_classes = config.num_classes
    # One confusion matrix feeds both IoU-based metrics.
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator, train_evaluator = create_evaluators(model, val_metrics, config)

    val_interval = getattr(config, "val_interval", 1)

    @trainer.on(Events.EPOCH_COMPLETED(every=val_interval))
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(val_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    # Validate once more at the very end only when the last epoch is not
    # already a validation epoch; otherwise the final validation would run
    # twice back to back.
    if config.num_epochs % val_interval != 0:
        trainer.add_event_handler(Events.COMPLETED, run_validation)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    # Store 3 best models by validation score (mIoU_bg):
    common.save_best_model_by_val_score(
        config.output_path.as_posix(),
        evaluator,
        model=model,
        metric_name=score_metric_name,
        n_saved=3,
        trainer=trainer,
        tag="val",
    )

    is_main_process = idist.get_rank() == 0

    if is_main_process:

        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={
                "training": train_evaluator,
                "validation": evaluator
            },
        )

        exp_tracking_logger = exp_tracking.setup_logging(trainer,
                                                         optimizer,
                                                         evaluators={
                                                             "training":
                                                             train_evaluator,
                                                             "validation":
                                                             evaluator
                                                         })

        # Log val predictions (images of the middle validation batch):
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2),
        )

    try:
        trainer.run(train_loader, max_epochs=config.num_epochs)
    finally:
        # Close the loggers even if training raised.
        if is_main_process:
            tb_logger.close()
            exp_tracking_logger.close()
コード例 #11
0
    # Fragment of a fine-tuning routine; the enclosing function header is not
    # part of this excerpt, so `model`, `loss_fn`, `device`, `config`, `args`
    # and the loaders are defined elsewhere.
    optimizer = _optimizer(model.parameters(), lr = config["learning_rate"])

    # Supervised trainer built from the model, optimizer and loss function.
    trainer = engine.create_supervised_trainer(
        model = model,
        optimizer = optimizer,
        loss_fn = loss_fn,
        device = device,
        non_blocking = True,
    )

    # NOTE(review): the two Accuracy entries below share the identical key
    # "[email protected]", which looks like an e-mail-obfuscation artifact of
    # names such as "<metric>@0.3". In a dict literal the later entry
    # replaces the earlier one, so only one of them is registered; restore
    # the intended, distinct names.
    # NOTE(review): the metrics appear to consume the sigmoid-activated
    # output produced by `output_transform` below, including the
    # CrossEntropyLoss-based "Loss" -- confirm that is intended.
    evaluator = engine.create_supervised_evaluator(
        model = model,
        metrics={
            "Loss": metrics.Loss(nn.CrossEntropyLoss()),
            "[email protected]": metrics.Accuracy(thresholded_transform(0.3)),
            "[email protected]": metrics.Accuracy(thresholded_transform(0.3)), 
            # IoU and mIoU each build their own ConfusionMatrix here.
            "IOU": metrics.IoU(metrics.ConfusionMatrix(num_classes = config["n_classes"])),
            "mIOU": metrics.mIoU(metrics.ConfusionMatrix(num_classes = config["n_classes"])),
            # "FPS": metrics.Frequency(output_transform=lambda x: x[0]),
        },
        device = device,
        non_blocking=True,
        # The model output is indexed by "out" and passed through a sigmoid.
        output_transform = lambda x, y, y_pred: (torch.sigmoid(y_pred["out"]), y),
    )

    # TensorBoard summaries are written under summary/<model_tag>.
    writer = tensorboard.SummaryWriter(log_dir=f'summary/{config["model_tag"]}')
    attach_metric_logger(evaluator, eval_loader, 'val', writer=writer)
    attach_training_logger(trainer, writer=writer, log_interval=1)
    # Assumes `model` is a wrapper exposing `.module` (e.g. DataParallel) --
    # TODO confirm.
    attach_model_checkpoint(trainer, {config["model_tag"]: model.module}, args.name)

    trainer.run(finetune_loader, max_epochs=config["epochs"])
コード例 #12
0
def training(local_rank, config, logger=None):
    """Run segmentation training with periodic train/validation evaluation.

    Creates the trainer and two evaluators, schedules validation every
    ``val_interval`` epochs (plus once at the end when the last epoch is not
    a validation epoch), optionally adds early stopping, keeps the three best
    models by ``mIoU_bg`` and, on rank 0, sets up TensorBoard and
    experiment-tracking logging.

    Args:
        local_rank: Index added to ``config.seed`` to seed this process.
        config: Configuration object. Attributes read here: ``seed``,
            ``train_loader``, ``val_loader``, ``train_eval_loader``,
            ``num_classes``, ``num_epochs``, ``output_path``,
            ``img_denormalize`` and the optional ``val_metrics``,
            ``val_interval``, ``start_by_validation`` and ``es_patience``.
        logger: Logger forwarded to ``create_trainer`` and ``log_metrics``.
    """
    #
    # if not getattr(config, "use_fp16", True):
    #     raise RuntimeError("This training script uses by default fp16 AMP")

    torch.backends.cudnn.benchmark = True

    set_seed(config.seed + local_rank)

    train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader

    # Setup model, optimizer, criterion
    model, optimizer, criterion = initialize(config)

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler,
                             config, logger)

    # Setup evaluators
    num_classes = config.num_classes
    # One confusion matrix feeds both IoU-based metrics.
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator, train_evaluator = create_evaluators(model, val_metrics, config)

    val_interval = getattr(config, "val_interval", 1)

    @trainer.on(Events.EPOCH_COMPLETED(every=val_interval))
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(val_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    # A final validation is only needed when the last epoch is not already a
    # validation epoch.
    if config.num_epochs % val_interval != 0:
        trainer.add_event_handler(Events.COMPLETED, run_validation)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    # Store 3 best models by validation score (mIoU_bg):
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models=model,
        metric_name=score_metric_name,
        n_saved=3,
        trainer=trainer,
        tag="val",
    )

    is_main_process = idist.get_rank() == 0

    if is_main_process:

        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={
                "training": train_evaluator,
                "validation": evaluator
            },
        )

        exp_tracking_logger = tracking.setup_logging(trainer,
                                                     optimizer,
                                                     evaluators={
                                                         "training":
                                                         train_evaluator,
                                                         "validation":
                                                         evaluator
                                                     })

        # Log validation predictions as images
        # We define a custom event filter to log less frequently the images (to reduce storage size)
        # - we plot images with masks of the middle validation batch
        # - once every 3 validations and
        # - at the end of the training
        def custom_event_filter(_, val_iteration):
            is_middle_batch = val_iteration == len(val_loader) // 2
            # Reuse the interval computed above rather than re-reading it
            # from the config on every validation iteration.
            is_logging_epoch = trainer.state.epoch % (val_interval * 3) == 0
            is_logging_epoch |= trainer.state.epoch == config.num_epochs
            return is_middle_batch and is_logging_epoch

        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(
                event_filter=custom_event_filter),
        )

    # NOTE(review): the original carried a "Log confusion matrix to Trains"
    # placeholder here with no implementation behind it.

    try:
        trainer.run(train_loader, max_epochs=config.num_epochs)
    finally:
        # Close the loggers even if training raised.
        if is_main_process:
            tb_logger.close()
            exp_tracking_logger.close()
コード例 #13
0
    # Fragment of an evaluation routine; the enclosing function header is not
    # part of this excerpt, so `dataset`, `img_path`, `label_path`, `model`,
    # `n_classes`, `device` and `config` are defined elsewhere.
    testdataset = AerialDataset("val", dataset, img_path, label_path)
    # drop_last=True discards a final batch smaller than 16 samples, so those
    # samples do not contribute to the metrics.
    testloader = DataLoader(
        testdataset,
        batch_size=16,
        pin_memory=True,
        drop_last=True,
    )

    # NOTE(review): the two Accuracy entries below share the identical key
    # "[email protected]", which looks like an e-mail-obfuscation artifact of
    # names such as "<metric>@0.3". In a dict literal the later entry
    # replaces the earlier one, so only one of them is registered; restore
    # the intended, distinct names.
    evaluator = engine.create_supervised_evaluator(
        model=model,
        metrics={
            "[email protected]": metrics.Accuracy(thresholded_transform(0.3)),
            "[email protected]": metrics.Accuracy(thresholded_transform(0.3)),
            # IoU and mIoU each build their own ConfusionMatrix here.
            "IOU": metrics.IoU(metrics.ConfusionMatrix(num_classes=n_classes)),
            "mIOU":
            metrics.mIoU(metrics.ConfusionMatrix(num_classes=n_classes)),
            # "FPS": metrics.Frequency(output_transform=lambda x: x[0]),
        },
        device=device,
        non_blocking=True,
        # The model output is indexed by "out" and passed through a sigmoid.
        output_transform=lambda x, y, y_pred:
        (torch.sigmoid(y_pred["out"]), y),
    )
    # TensorBoard summaries are written under summary/<model_tag>.
    writer = tensorboard.SummaryWriter(
        log_dir=f'summary/{config["model_tag"]}')

    attach_metric_logger(evaluator, 'val', writer)
    evaluator.run(testloader)

    # Passes the model and the first dataset sample (with a leading dimension
    # added by unsqueeze(0)) to the FLOPs/parameter helper.
    getFlopsandParams(model, testdataset[0][0].unsqueeze(0))
コード例 #14
0
}

# NOTE(review): clear_cuda is defined outside this excerpt; presumably it
# releases cached GPU memory before the model is built -- confirm.
clear_cuda()
# U-Net whose input channels, class count and depth come from `params`.
model = UNet(in_channels=params['in_channels'],
             n_classes=params['n_classes'],
             depth=params['depth'])

model.to(device)
# NLLLoss expects log-probabilities as input, so this assumes the model's
# final layer emits log-softmax output -- TODO confirm.
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])

# Determine metrics for evaluation.
# The same metric instances are handed to both evaluators created below.
metrics = {
    "accuracy": Accuracy(),
    "loss": Loss(criterion),
    "mean_iou": mIoU(ConfusionMatrix(num_classes=params['n_classes'])),
}

# Create Trainer or Evaluators
trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
train_evaluator = create_supervised_evaluator(model,
                                              metrics=metrics,
                                              device=device)
validation_evaluator = create_supervised_evaluator(model,
                                                   metrics=metrics,
                                                   device=device)

# Give each engine its own named logger.
trainer.logger = setup_logger("Trainer")
train_evaluator.logger = setup_logger("Train Evaluator")
validation_evaluator.logger = setup_logger("Validation Evaluator")
コード例 #15
0
def training(config, local_rank, with_pbar_on_iters=True):
    """Run distributed semi-supervised segmentation training with fp16 AMP.

    Every iteration applies a supervised loss on a labelled batch and then an
    unsupervised consistency loss on an unlabelled batch: the model's own
    prediction on the original images (rotated/flipped and turned into hard
    labels) is the target for its prediction on the equally rotated/flipped
    images. The two losses are scaled and back-propagated separately.
    Evaluation, early stopping, best-model saving and TensorBoard/MLflow
    logging are attached before the training loop is started.

    Args:
        config: Configuration object. Attributes read here: ``seed``,
            ``train_loader``, ``unsup_train_loader``, ``train_eval_loader``,
            ``val_loader``, ``model``, ``optimizer``, ``criterion``,
            ``unsup_criterion``, ``prepare_batch``, ``lr_scheduler``,
            ``output_path``, ``num_classes``, ``num_epochs``,
            ``img_denormalize`` plus several optional settings fetched with
            ``getattr``/``hasattr`` below.
        local_rank: Local process index; selects the CUDA device and is added
            to the seed.
        with_pbar_on_iters: Whether to show per-iteration progress bars.

    Raises:
        RuntimeError: If ``config.use_fp16`` is set to a false value.
        ValueError: If ``config.unsup_batch_num_repetitions`` is below 1.
    """

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
                                      "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    unsup_train_loader = config.unsup_train_loader
    unsup_train_sampler = getattr(unsup_train_loader, "sampler", None)
    assert unsup_train_sampler is not None, "Train loader of type '{}' " \
                                      "should have attribute 'sampler'".format(type(unsup_train_loader))
    assert hasattr(unsup_train_sampler, 'set_epoch') and callable(unsup_train_sampler.set_epoch), \
        "Unsupervised train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    # Two losses (supervised / unsupervised) are scaled independently.
    model, optimizer = amp.initialize(model, optimizer, opt_level=getattr(config, "fp16_opt_level", "O2"), num_losses=2)
    model = DDP(model, delay_allreduce=True)

    criterion = config.criterion.to(device)
    unsup_criterion = config.unsup_criterion.to(device)
    unsup_batch_num_repetitions = getattr(config, "unsup_batch_num_repetitions", 1)
    if unsup_batch_num_repetitions < 1:
        # With zero repetitions the update function would reach the total
        # loss computation without ever defining the unsupervised loss.
        raise ValueError("unsup_batch_num_repetitions should be at least 1, "
                         "but given {}".format(unsup_batch_num_repetitions))

    # Setup trainer
    prepare_batch = getattr(config, "prepare_batch")
    non_blocking = getattr(config, "non_blocking", True)
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform", lambda x: x)

    def cycle(seq):
        """Yield items of ``seq`` endlessly, re-iterating it when exhausted."""
        while True:
            for i in seq:
                yield i

    unsup_train_loader_iter = cycle(unsup_train_loader)

    def supervised_loss(batch):
        """Return the criterion value for one labelled batch."""
        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y)
        return loss

    def unsupervised_loss(x):
        """Return the consistency loss for one unlabelled image batch."""

        # The target is computed without gradients.
        with torch.no_grad():
            y_pred_orig = model(x)

            # Data augmentation: geom only
            # The same rotation/flips are applied to input and prediction.
            k = random.randint(1, 3)
            x_aug = torch.rot90(x, k=k, dims=(2, 3))
            y_pred_orig_aug = torch.rot90(y_pred_orig, k=k, dims=(2, 3))
            if random.random() < 0.5:
                x_aug = torch.flip(x_aug, dims=(2, ))
                y_pred_orig_aug = torch.flip(y_pred_orig_aug, dims=(2, ))
            if random.random() < 0.5:
                x_aug = torch.flip(x_aug, dims=(3, ))
                y_pred_orig_aug = torch.flip(y_pred_orig_aug, dims=(3, ))

            # Hard labels: index of the highest-scoring entry along dim 1.
            y_pred_orig_aug = y_pred_orig_aug.argmax(dim=1).long()

        y_pred_aug = model(x_aug.detach())

        loss = unsup_criterion(y_pred_aug, y_pred_orig_aug.detach())

        return loss

    def train_update_function(engine, batch):
        """Apply the supervised and unsupervised updates for one iteration."""
        model.train()

        loss = supervised_loss(batch)
        if isinstance(loss, Mapping):
            assert 'supervised batch loss' in loss
            loss_dict = loss
            output = {k: v.item() for k, v in loss_dict.items()}
            loss = loss_dict['supervised batch loss'] / accumulation_steps
        else:
            output = {'supervised batch loss': loss.item()}

        # Difference with original UDA
        # Apply separately grads from supervised/unsupervised parts
        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        unsup_batch = next(unsup_train_loader_iter)
        unsup_x = unsup_batch['image']
        unsup_x = convert_tensor(unsup_x, device=device, non_blocking=non_blocking)

        for _ in range(unsup_batch_num_repetitions):
            unsup_loss = engine.state.unsup_lambda * unsupervised_loss(unsup_x)

            assert isinstance(unsup_loss, torch.Tensor)
            output['unsupervised batch loss'] = unsup_loss.item()

            with amp.scale_loss(unsup_loss, optimizer, loss_id=1) as scaled_loss:
                scaled_loss.backward()

            if engine.state.iteration % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

        # Drop the references to the unlabelled batch.
        unsup_batch = None
        unsup_x = None

        # Reported total uses the unsupervised loss of the last repetition.
        total_loss = loss +  unsup_loss
        output['total batch loss'] = total_loss.item()

        return output

    output_names = getattr(config, "output_names",
                           ['supervised batch loss', 'unsupervised batch loss', 'total batch loss'])

    trainer = Engine(train_update_function)

    @trainer.on(Events.STARTED)
    def init(engine):
        # Starting weight of the unsupervised loss.
        if hasattr(config, "unsup_lambda_min"):
            engine.state.unsup_lambda = config.unsup_lambda_min
        else:
            engine.state.unsup_lambda = getattr(config, "unsup_lambda", 0.001)

    @trainer.on(Events.ITERATION_COMPLETED)
    def update_unsup_params(engine):
        # Increase the weight every iteration, capped at unsup_lambda_max.
        engine.state.unsup_lambda += getattr(config, "unsup_lambda_delta", 0.00001)
        if hasattr(config, "unsup_lambda_max"):
            m = config.unsup_lambda_max
            engine.state.unsup_lambda = engine.state.unsup_lambda if engine.state.unsup_lambda < m else m

    common.setup_common_distrib_training_handlers(
        trainer, train_sampler,
        to_save={'model': model, 'optimizer': optimizer},
        save_every_iters=1000, output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler, output_names=output_names,
        with_pbars=True, with_pbar_on_iters=with_pbar_on_iters, log_every_iters=1
    )

    def output_transform(output):
        """Unpack the evaluator output dict into a (y_pred, y) pair."""
        return output['y_pred'], output['y']

    num_classes = config.num_classes
    # One confusion matrix feeds all the cm-based metrics below.
    cm_metric = ConfusionMatrix(num_classes=num_classes, output_transform=output_transform)
    pr = cmPrecision(cm_metric, average=False)
    re = cmRecall(cm_metric, average=False)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
        "Accuracy": cmAccuracy(cm_metric),
        "Precision": pr,
        "Recall": re,
        "F1": Fbeta(beta=1.0, output_transform=output_transform)
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator_args = dict(
        model=model, metrics=val_metrics, device=device, non_blocking=non_blocking, prepare_batch=prepare_batch,
        output_transform=lambda x, y, y_pred: {'y_pred': model_output_transform(y_pred), 'y': y}
    )
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_pbar_on_iters:
        ProgressBar(persist=False, desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(engine):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    val_interval = getattr(config, "val_interval", 1)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=val_interval), run_validation)
    # A final validation is only needed when the last epoch is not already a
    # validation epoch; otherwise it would run twice back to back.
    if config.num_epochs % val_interval != 0:
        trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    if dist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(config.output_path.as_posix(), trainer, optimizer,
                                            evaluators={"training": train_evaluator, "validation": evaluator})
        common.setup_mlflow_logging(trainer, optimizer,
                                    evaluators={"training": train_evaluator, "validation": evaluator})

        common.save_best_model_by_val_score(config.output_path.as_posix(), evaluator, model,
                                            metric_name=score_metric_name, trainer=trainer)

        # Log unsup_lambda
        @trainer.on(Events.ITERATION_COMPLETED(every=100))
        def tblog_unsupervised_lambda(engine):
            tb_logger.writer.add_scalar("training/unsupervised lambda", engine.state.unsup_lambda, engine.state.iteration)
            mlflow.log_metric("training unsupervised lambda", engine.state.unsup_lambda, step=engine.state.iteration)

        # Log train/val predictions:
        tb_logger.attach(evaluator,
                         log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize,
                                                                   n_images=15,
                                                                   another_engine=trainer,
                                                                   prefix_tag="validation"),
                         event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2))

        log_train_predictions = getattr(config, "log_train_predictions", False)
        if log_train_predictions:
            # Use a distinct tag so the training images are not logged under
            # the same name as the validation images.
            tb_logger.attach(train_evaluator,
                             log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize,
                                                                       n_images=15,
                                                                       another_engine=trainer,
                                                                       prefix_tag="training"),
                             event_name=Events.ITERATION_COMPLETED(once=len(train_eval_loader) // 2))

    trainer.run(train_loader, max_epochs=config.num_epochs)
コード例 #16
0
def training(local_rank, config, logger, with_clearml):
    """Train a model and periodically evaluate it with IoU-based metrics.

    Args:
        local_rank: Rank of this process on its node; added to ``config.seed``
            when seeding.
        config: Configuration object. The code reads ``seed``, ``train_loader``,
            ``val_loader``, ``train_eval_loader``, ``num_classes``,
            ``num_epochs`` and ``output_path`` as attributes, and treats
            ``val_metrics``, ``val_interval``, ``start_by_validation``,
            ``es_patience``, ``mean`` and ``std`` as optional entries.
        logger: Logger forwarded to ``create_trainer`` and ``utils.log_metrics``.
        with_clearml: Forwarded to the trainer / evaluator / save-handler
            factories; when truthy, the confusion matrix is also reported once
            training completes.
    """
    rank = idist.get_rank()
    manual_seed(config.seed + local_rank)

    train_loader = config.train_loader
    val_loader = config.val_loader
    train_eval_loader = config.train_eval_loader

    model, optimizer, criterion = utils.initialize(config)

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger, with_clearml)

    # Setup evaluators: IoU and mIoU are both derived from one shared confusion matrix
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    # User-supplied metrics extend (and may override) the defaults
    if ("val_metrics" in config) and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="val")
    train_evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="train")

    val_interval = config.get("val_interval", 1)

    # Run validation on every val_interval epoch, at the end of the training
    # (only when the last epoch is not already covered by the interval)
    # and at the beginning if config.start_by_validation is True
    event = Events.EPOCH_COMPLETED(every=val_interval)
    if config.num_epochs % val_interval != 0:
        event |= Events.COMPLETED
    if config.get("start_by_validation", False):
        event |= Events.STARTED

    @trainer.on(event)
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        utils.log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(val_loader)
        utils.log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    score_metric_name = "mIoU_bg"
    if "es_patience" in config:
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    # Keep the 2 best models, ranked by the validation `score_metric_name` value
    common.gen_save_best_models_by_val_score(
        save_handler=utils.get_save_handler(config.output_path.as_posix(), with_clearml),
        evaluator=evaluator,
        models=model,
        metric_name=score_metric_name,
        n_saved=2,
        trainer=trainer,
        tag="val",
    )

    # Setup Tensorboard logger (only on the rank-0 process)
    if rank == 0:
        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={"training": train_evaluator, "validation": evaluator},
        )

        # Log validation predictions as images
        # We define a custom event filter to log less frequently the images (to reduce storage size)
        # - we plot images with masks of the middle validation batch
        # - once every 3 validations and
        # - at the end of the training
        def custom_event_filter(_, val_iteration):
            c1 = val_iteration == len(val_loader) // 2
            c2 = trainer.state.epoch % (config.get("val_interval", 1) * 3) == 0
            c2 |= trainer.state.epoch == config.num_epochs
            return c1 and c2

        # Image denormalization function to plot predictions with images
        mean = config.get("mean", (0.485, 0.456, 0.406))
        std = config.get("std", (0.229, 0.224, 0.225))
        img_denormalize = partial(data.denormalize, mean=mean, std=std)

        tb_logger.attach(
            evaluator,
            log_handler=vis.predictions_gt_images_handler(
                img_denormalize_fn=img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation",
            ),
            event_name=Events.ITERATION_COMPLETED(event_filter=custom_event_filter),
        )

    # Log confusion matrix to ClearML:
    if with_clearml:

        @trainer.on(Events.COMPLETED)
        def log_final_confusion_matrix():
            # The iteration has to be read when the event fires. Passing
            # `trainer.state.iteration` as a handler argument at registration
            # time would freeze the value held before `trainer.run` starts.
            compute_and_log_cm(cm_metric, trainer.state.iteration)

    trainer.run(train_loader, max_epochs=config.num_epochs)

    # `tb_logger` only exists on the rank-0 process
    if rank == 0:
        tb_logger.close()
コード例 #17
0
ファイル: training.py プロジェクト: vfdev-5/UNOSAT_Challenge
def training(config, local_rank, with_pbar_on_iters=True):
    """Run fp16 AMP distributed training with periodic train/val evaluation.

    Args:
        config: Configuration object read through attribute access. Required:
            ``seed``, ``train_loader``, ``train_eval_loader``, ``val_loader``,
            ``model``, ``optimizer``, ``criterion``, ``prepare_batch``,
            ``num_classes``, ``num_epochs``, ``output_path``, ``lr_scheduler``
            and ``img_denormalize``. Optional (with defaults): ``use_fp16``,
            ``fp16_opt_level``, ``non_blocking``, ``accumulation_steps``,
            ``model_output_transform``, ``output_names``, ``val_metrics``,
            ``start_by_validation``, ``val_interval``, ``es_patience`` and
            ``log_train_predictions``.
        local_rank: Index of the CUDA device used by this process; also added
            to ``config.seed`` when seeding.
        with_pbar_on_iters: Whether to show per-iteration progress bars.

    Raises:
        RuntimeError: If ``config.use_fp16`` is explicitly falsy.
        AssertionError: If the train loader has no sampler, or the sampler has
            no callable ``set_epoch``.
    """
    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
                                      "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=getattr(
                                          config, "fp16_opt_level", "O2"),
                                      num_losses=1)
    model = DDP(model, delay_allreduce=True)

    criterion = config.criterion.to(device)

    # Setup trainer
    prepare_batch = getattr(config, "prepare_batch")
    non_blocking = getattr(config, "non_blocking", True)
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    def train_update_function(engine, batch):
        """Run one forward/backward pass and return the unscaled loss value(s)."""
        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y)

        # The reported values stay unscaled; only the tensor that is
        # back-propagated is divided by the number of accumulation steps.
        if isinstance(loss, Mapping):
            assert 'supervised batch loss' in loss
            loss_dict = loss
            output = {k: v.item() for k, v in loss_dict.items()}
            loss = loss_dict['supervised batch loss'] / accumulation_steps
        else:
            output = {'supervised batch loss': loss.item()}
            # Scale exactly like the Mapping branch so accumulated gradients
            # average over `accumulation_steps` instead of summing.
            loss = loss / accumulation_steps

        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        # Step the optimizer only once every `accumulation_steps` iterations
        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return output

    output_names = getattr(config, "output_names", [
        'supervised batch loss',
    ])

    trainer = Engine(train_update_function)
    common.setup_common_distrib_training_handlers(
        trainer,
        train_sampler,
        to_save={
            'model': model,
            'optimizer': optimizer
        },
        save_every_iters=1000,
        output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler,
        output_names=output_names,
        with_pbars=True,
        with_pbar_on_iters=with_pbar_on_iters,
        log_every_iters=1)

    def output_transform(output):
        # The evaluators emit a dict; metrics expect a (y_pred, y) pair
        return output['y_pred'], output['y']

    # All confusion-matrix based metrics share a single ConfusionMatrix instance
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes,
                                output_transform=output_transform)
    pr = cmPrecision(cm_metric, average=False)
    re = cmRecall(cm_metric, average=False)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
        "Accuracy": cmAccuracy(cm_metric),
        "Precision": pr,
        "Recall": re,
        "F1": Fbeta(beta=1.0, output_transform=output_transform)
    }

    # User-supplied metrics extend (and may override) the defaults
    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator_args = dict(model=model,
                          metrics=val_metrics,
                          device=device,
                          non_blocking=non_blocking,
                          prepare_batch=prepare_batch,
                          output_transform=lambda x, y, y_pred: {
                              'y_pred': model_output_transform(y_pred),
                              'y': y
                          })
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_pbar_on_iters:
        ProgressBar(persist=False,
                    desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(engine):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    val_interval = getattr(config, "val_interval", 1)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=val_interval),
                              run_validation)
    # Validate at the very end only when the final epoch was not already
    # validated by the interval handler; otherwise the same epoch would be
    # evaluated (and logged) twice.
    if config.num_epochs % val_interval != 0:
        trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    # Logging and best-model checkpointing are done by the rank-0 process only
    if dist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(config.output_path.as_posix(),
                                            trainer,
                                            optimizer,
                                            evaluators={
                                                "training": train_evaluator,
                                                "validation": evaluator
                                            })
        common.setup_mlflow_logging(trainer,
                                    optimizer,
                                    evaluators={
                                        "training": train_evaluator,
                                        "validation": evaluator
                                    })

        common.save_best_model_by_val_score(config.output_path.as_posix(),
                                            evaluator,
                                            model,
                                            metric_name=score_metric_name,
                                            trainer=trainer)

        # Log train/val predictions (taken from the middle batch of each run):
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2))

        log_train_predictions = getattr(config, "log_train_predictions", False)
        if log_train_predictions:
            # Use a distinct tag so training images are not filed under the
            # validation ones.
            tb_logger.attach(train_evaluator,
                             log_handler=predictions_gt_images_handler(
                                 img_denormalize_fn=config.img_denormalize,
                                 n_images=15,
                                 another_engine=trainer,
                                 prefix_tag="training"),
                             event_name=Events.ITERATION_COMPLETED(
                                 once=len(train_eval_loader) // 2))

    trainer.run(train_loader, max_epochs=config.num_epochs)
コード例 #18
0
def training(local_rank, config, logger=None):
    """Train a model with fp16 AMP and evaluate it on a fixed epoch interval.

    Args:
        local_rank: Rank of this process on its node; added to ``config.seed``
            when seeding.
        config: Configuration object read through attribute access
            (``seed``, the three data loaders, ``num_classes``, ``num_epochs``,
            ``output_path``, ``img_denormalize`` plus optional ``use_fp16``,
            ``val_metrics``, ``val_interval``, ``start_by_validation`` and
            ``es_patience``).
        logger: Optional logger forwarded to ``create_trainer`` and
            ``log_metrics``.

    Raises:
        RuntimeError: If ``config.use_fp16`` is explicitly falsy.
    """
    # An explicit opt-out of fp16 is not supported by this script
    use_fp16 = getattr(config, "use_fp16", True)
    if not use_fp16:
        raise RuntimeError("This training script uses by default fp16 AMP")

    torch.backends.cudnn.benchmark = True

    set_seed(config.seed + local_rank)

    train_loader = config.train_loader
    val_loader = config.val_loader
    train_eval_loader = config.train_eval_loader

    # Model, optimizer and criterion come from the shared factory
    model, optimizer, criterion = initialize(config)

    # Task-specific trainer
    trainer = create_trainer(
        model, optimizer, criterion, train_loader.sampler, config, logger
    )

    # Evaluation metrics: both are derived from the same confusion matrix
    cm_metric = ConfusionMatrix(num_classes=config.num_classes)
    val_metrics = {"IoU": IoU(cm_metric), "mIoU_bg": mIoU(cm_metric)}

    # Optional user metrics extend (and may override) the defaults
    extra_metrics = getattr(config, "val_metrics", None)
    if isinstance(extra_metrics, dict):
        val_metrics.update(extra_metrics)

    evaluator, train_evaluator = create_evaluators(model, val_metrics, config)

    val_interval = getattr(config, "val_interval", 1)

    def run_validation():
        epoch = trainer.state.epoch
        runs = (
            (train_evaluator, train_eval_loader, "Train"),
            (evaluator, val_loader, "Test"),
        )
        for eval_engine, loader, tag in runs:
            state = eval_engine.run(loader)
            log_metrics(logger, epoch, state.times["COMPLETED"], tag, state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=val_interval), run_validation)

    # Final validation only when the last epoch is not hit by the interval
    if config.num_epochs % val_interval != 0:
        trainer.add_event_handler(Events.COMPLETED, run_validation)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(
            config.es_patience, evaluator, trainer, metric_name=score_metric_name
        )

    # Keep the 3 best models, ranked by the validation score metric
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models=model,
        metric_name=score_metric_name,
        n_saved=3,
        trainer=trainer,
        tag="val",
    )

    if idist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={"training": train_evaluator, "validation": evaluator},
        )

        if not exp_tracking.has_clearml:
            exp_tracking_logger = exp_tracking.setup_logging(
                trainer,
                optimizer,
                evaluators={"training": train_evaluator, "validation": evaluator},
            )

        # Prediction images are written sparingly to limit storage:
        # only for the middle validation batch, and only on every third
        # validation or on the final epoch.
        def custom_event_filter(_, val_iteration):
            is_middle_batch = val_iteration == len(val_loader) // 2
            current_epoch = trainer.state.epoch
            every_third_validation = (
                current_epoch % (getattr(config, "val_interval", 1) * 3) == 0
            )
            is_last_epoch = current_epoch == config.num_epochs
            return is_middle_batch and (every_third_validation | is_last_epoch)

        images_handler = predictions_gt_images_handler(
            img_denormalize_fn=config.img_denormalize,
            n_images=15,
            another_engine=trainer,
            prefix_tag="validation",
        )
        tb_logger.attach(
            evaluator,
            log_handler=images_handler,
            event_name=Events.ITERATION_COMPLETED(event_filter=custom_event_filter),
        )

    # Report the final confusion matrix to ClearML
    if exp_tracking.has_clearml:

        def compute_and_log_cm():
            # Normalized so that the diagonal holds per-class recalls
            cm = ConfusionMatrix.normalize(cm_metric.compute(), "recall").cpu().numpy()

            if idist.get_rank() != 0:
                return

            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            clearml_logger = Task.current_task().get_logger()
            clearml_logger.report_confusion_matrix(
                title="Final Confusion Matrix",
                series="cm-preds-gt",
                matrix=cm,
                iteration=trainer.state.iteration,
                xlabels=VOCSegmentationOpencv.target_names,
                ylabels=VOCSegmentationOpencv.target_names,
            )

        trainer.add_event_handler(Events.COMPLETED, compute_and_log_cm)

    trainer.run(train_loader, max_epochs=config.num_epochs)

    # Loggers only exist on the rank-0 process
    if idist.get_rank() != 0:
        return

    tb_logger.close()
    if not exp_tracking.has_clearml:
        exp_tracking_logger.close()
def train():
    """Train BiSeNetV2 on an aerial dataset, evaluating and logging along the way.

    Reads command-line arguments and the configuration file, prepares the log
    folder, builds data loaders, model, optimizer and loss, creates the ignite
    trainer/evaluator engines, attaches checkpointing and logging handlers and
    finally runs the training loop.
    """
    # initiate command line arguments, configuration file and logging block
    args = parse_args()
    config = read_config()

    log_dir = f"./logs/{args.name}"
    if args.overwrite:
        shutil.rmtree(log_dir, ignore_errors=True)
    try:
        # makedirs also creates a missing "./logs" parent
        os.makedirs(log_dir)
    except FileExistsError:
        # Only an existing folder is expected here; any other OS error
        # (e.g. permissions) now propagates instead of being misreported.
        print(f"log folder {args.name} already exits.")

    init_logging(log_path=log_dir)

    # determine train model on which device, cuda or cpu
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.info(f"running training on {device}")
    if device == 'cuda':
        # A device index is only meaningful for CUDA devices
        device += f':{args.main_cuda}'

    # prepare training and validation datasets
    logger.info('creating dataset and data loaders')
    dataset = args.dataset

    train_dataset = AerialDataset("train", dataset,
                                  config[dataset]["train"]["image_path"],
                                  config[dataset]["train"]["mask_path"])
    val_dataset = AerialDataset("val", dataset,
                                config[dataset]["val"]["image_path"],
                                config[dataset]["val"]["mask_path"])
    train_loader, train_metrics_loader, val_metrics_loader = create_data_loaders(
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        num_workers=config["num_workers"],
        batch_size=config["batchsize"],
    )

    # create model
    logger.info(
        f'creating BiseNetv2 and optimizer with initial lr of {config["learning_rate"]}'
    )

    model = BiSeNetV2(config["n_classes"])
    # Device ids run from the main device up to (excluding) 4
    model = nn.DataParallel(model,
                            device_ids=list(range(args.main_cuda, 4))).to(device)

    # initiate loss function and optimizer
    optimizer_fn = init_optimizer(config)
    optimizer = optimizer_fn(model.parameters(), lr=config["learning_rate"])

    logger.info('creating trainer and evaluator engines')

    _loss_fn = init_loss(config["loss_fn"])
    loss_fn = LossWithAux(_loss_fn)

    # create trainer and evaluator with ignite.engine
    trainer = engine.create_supervised_trainer(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        device=device,
        non_blocking=True,
    )

    # IoU and mIoU share one confusion matrix so it is accumulated only once
    confusion_matrix = metrics.ConfusionMatrix(num_classes=config["n_classes"])

    evaluator = engine.create_supervised_evaluator(
        model=model,
        metrics={
            'loss':
            metrics.Loss(nn.CrossEntropyLoss()),
            # NOTE(review): this key was listed twice with identical values, so
            # only one entry ever reached the evaluator. The key text also looks
            # mangled -- confirm the intended metric names and thresholds.
            "[email protected]":
            metrics.Accuracy(thresholded_transform(0.3)),
            "IOU":
            metrics.IoU(confusion_matrix),
            "mIOU":
            metrics.mIoU(confusion_matrix),
        },
        device=device,
        non_blocking=True,
        output_transform=lambda x, y, y_pred:
        (torch.sigmoid(y_pred["out"]), y),
    )

    # attach event listener to do post process after each iteration and epoch

    logger.info(f'creating summary writer with tag {config["model_tag"]}')
    writer = tensorboard.SummaryWriter(log_dir=f'logs/{config["model_tag"]}')

    try:
        logger.info('attaching event driven calls')
        # Checkpoint the wrapped module rather than the DataParallel wrapper
        attach_model_checkpoint(trainer, {config["model_tag"]: model.module},
                                args.name)
        attach_training_logger(trainer, writer=writer)

        attach_metric_logger(trainer, evaluator, 'train', train_metrics_loader,
                             writer)
        attach_metric_logger(trainer, evaluator, 'val', val_metrics_loader,
                             writer)

        # start training (evaluation is included too)
        logger.info('training...')
        trainer.run(train_loader, max_epochs=config["epochs"])
    finally:
        # Flush and release the event file even if training fails
        writer.close()