Example 1
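This example is a distributed training loop in the style of PyTorch-Ignite's segmentation examples (apex AMP plus DDP); its module header is not shown. The sketch below lists plausible imports for the names it uses, assuming a pre-0.5 ignite; `set_seed` and `predictions_gt_images_handler` are project-local helpers, so their locations are hypothetical.

# Plausible imports for the example below (assumed; not part of the listing).
from collections.abc import Mapping

import torch
import torch.distributed as dist
from apex import amp
from apex.parallel import DistributedDataParallel as DDP

import ignite.contrib.engines.common as common
from ignite.contrib.handlers import ProgressBar
from ignite.engine import Engine, Events, _prepare_batch, create_supervised_evaluator
from ignite.metrics import ConfusionMatrix, IoU, mIoU

# Project-local helpers (hypothetical module paths):
# from utils import set_seed, predictions_gt_images_handler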
def training(config,
             local_rank=None,
             with_mlflow_logging=False,
             with_plx_logging=False):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses fp16 AMP by default and requires use_fp16=True")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = "cuda"

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, (
        "Train loader of type '{}' should have attribute 'sampler'".format(
            type(train_loader)
        )
    )
    assert hasattr(train_sampler, "set_epoch") and callable(
        train_sampler.set_epoch
    ), "Train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(
        model,
        optimizer,
        opt_level=getattr(config, "fp16_opt_level", "O2"),
        num_losses=1,
    )
    model = DDP(model, delay_allreduce=True)
    criterion = config.criterion.to(device)

    prepare_batch = getattr(config, "prepare_batch", _prepare_batch)
    non_blocking = getattr(config, "non_blocking", True)

    # Setup trainer
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    def train_update_function(engine, batch):

        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y)

        if isinstance(loss, Mapping):
            assert "supervised batch loss" in loss
            loss_dict = loss
            output = {k: v.item() for k, v in loss_dict.items()}
            loss = loss_dict["supervised batch loss"] / accumulation_steps
        else:
            output = {"supervised batch loss": loss.item()}
            # Scale the plain-tensor loss as well, so gradient accumulation
            # averages correctly in both branches.
            loss = loss / accumulation_steps

        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return output

    output_names = getattr(
        config,
        "output_names",
        [
            "supervised batch loss",
        ],
    )

    trainer = Engine(train_update_function)
    common.setup_common_distrib_training_handlers(
        trainer,
        train_sampler,
        to_save={
            "model": model,
            "optimizer": optimizer
        },
        save_every_iters=1000,
        output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler,
        with_gpu_stats=True,
        output_names=output_names,
        with_pbars=True,
        with_pbar_on_iters=with_mlflow_logging,
        log_every_iters=1,
    )

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    evaluator_args = dict(
        model=model,
        metrics=val_metrics,
        device=device,
        non_blocking=non_blocking,
        prepare_batch=prepare_batch,
        output_transform=lambda x, y, y_pred: (
            model_output_transform(y_pred),
            y,
        ),
    )
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_mlflow_logging:
        ProgressBar(persist=False,
                    desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(_):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)),
        run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    if dist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={
                "training": train_evaluator,
                "validation": evaluator
            },
        )
        if with_mlflow_logging:
            common.setup_mlflow_logging(
                trainer,
                optimizer,
                evaluators={
                    "training": train_evaluator,
                    "validation": evaluator
                },
            )

        if with_plx_logging:
            common.setup_plx_logging(
                trainer,
                optimizer,
                evaluators={
                    "training": train_evaluator,
                    "validation": evaluator
                },
            )

        common.save_best_model_by_val_score(
            config.output_path.as_posix(),
            evaluator,
            model,
            metric_name=score_metric_name,
            trainer=trainer,
        )

        # Log train/val predictions:
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation",
            ),
            event_name=Events.EPOCH_COMPLETED,
        )

        log_train_predictions = getattr(config, "log_train_predictions", False)
        if log_train_predictions:
            tb_logger.attach(
                train_evaluator,
                log_handler=predictions_gt_images_handler(
                    img_denormalize_fn=config.img_denormalize,
                    n_images=15,
                    another_engine=trainer,
                    prefix_tag="validation",
                ),
                event_name=Events.EPOCH_COMPLETED,
            )

    trainer.run(train_loader, max_epochs=config.num_epochs)
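`training` assumes an already-initialized process group: it calls `dist.get_rank()` and wraps the model in apex `DDP`. A minimal driver sketch under that assumption follows; the original entry point is not shown, so `run` and the launch command are illustrative only.

import torch.distributed as dist

# One process per GPU, launched e.g. with:
#   python -m torch.distributed.launch --nproc_per_node=<N> main.py
def run(config, local_rank):
    dist.init_process_group("nccl", init_method="env://")
    try:
        training(config, local_rank=local_rank)
    finally:
        dist.destroy_process_group()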
Example 2
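Example 2 follows the same skeleton as Example 1 but targets image classification. On top of the imports sketched for Example 1 it needs the classification metrics (again an assumption); `DataflowBenchmark` is a project-local utility, not an ignite class.

# Additional imports for this example (assumed):
from ignite.metrics import Accuracy, TopKCategoricalAccuracy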
def training(config,
             local_rank=None,
             with_mlflow_logging=False,
             with_plx_logging=False):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses fp16 AMP by default and requires use_fp16=True")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
                                      "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=getattr(
                                          config, "fp16_opt_level", "O2"),
                                      num_losses=1)
    model = DDP(model, delay_allreduce=True)
    criterion = config.criterion.to(device)

    prepare_batch = getattr(config, "prepare_batch", _prepare_batch)
    non_blocking = getattr(config, "non_blocking", True)

    # Setup trainer
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    def train_update_function(engine, batch):

        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y) / accumulation_steps

        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return {
            'supervised batch loss': loss.item(),
        }

    trainer = Engine(train_update_function)
    common.setup_common_distrib_training_handlers(
        trainer,
        train_sampler,
        to_save={
            'model': model,
            'optimizer': optimizer
        },
        save_every_iters=1000,
        output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler,
        with_gpu_stats=True,
        output_names=[
            'supervised batch loss',
        ],
        with_pbars=True,
        with_pbar_on_iters=with_mlflow_logging,
        log_every_iters=1)

    if getattr(config, "benchmark_dataflow", False):
        benchmark_dataflow_num_iters = getattr(config,
                                               "benchmark_dataflow_num_iters",
                                               1000)
        DataflowBenchmark(benchmark_dataflow_num_iters,
                          prepare_batch=prepare_batch,
                          device=device).attach(trainer, train_loader)

    # Setup evaluators
    val_metrics = {
        "Accuracy": Accuracy(device=device),
        "Top-5 Accuracy": TopKCategoricalAccuracy(k=5, device=device),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    evaluator_args = dict(model=model,
                          metrics=val_metrics,
                          device=device,
                          non_blocking=non_blocking,
                          prepare_batch=prepare_batch,
                          output_transform=lambda x, y, y_pred: (
                              model_output_transform(y_pred),
                              y,
                          ))
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_mlflow_logging:
        ProgressBar(persist=False,
                    desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(_):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)),
        run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "Accuracy"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    if dist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(config.output_path.as_posix(),
                                            trainer,
                                            optimizer,
                                            evaluators={
                                                "training": train_evaluator,
                                                "validation": evaluator
                                            })
        if with_mlflow_logging:
            common.setup_mlflow_logging(trainer,
                                        optimizer,
                                        evaluators={
                                            "training": train_evaluator,
                                            "validation": evaluator
                                        })

        if with_plx_logging:
            common.setup_plx_logging(trainer,
                                     optimizer,
                                     evaluators={
                                         "training": train_evaluator,
                                         "validation": evaluator
                                     })

        common.save_best_model_by_val_score(config.output_path.as_posix(),
                                            evaluator,
                                            model,
                                            metric_name=score_metric_name,
                                            trainer=trainer)

        # Log train/val predictions:
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2))

        tb_logger.attach(train_evaluator,
                         log_handler=predictions_gt_images_handler(
                             img_denormalize_fn=config.img_denormalize,
                             n_images=15,
                             another_engine=trainer,
                             prefix_tag="training"),
                         event_name=Events.ITERATION_COMPLETED(
                             once=len(train_eval_loader) // 2))

    trainer.run(train_loader, max_epochs=config.num_epochs)
Example 3
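Example 3 is a Jinja2 template: the {% if %}/{% elif %} tags are resolved when a project is generated, as in PyTorch-Ignite's Code-Generator templates. The names in the body suggest roughly the following imports; module paths vary across ignite releases (BaseLogger later moved out of ignite.contrib), so treat them as assumptions.

# Assumed imports for the template body below:
from typing import Any, Dict, Optional, Union

from torch.optim import Optimizer

from ignite.contrib.engines import common
from ignite.contrib.handlers.base_logger import BaseLogger
from ignite.engine import Engine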
def get_logger(
    config: Any,
    trainer: Engine,
    evaluator: Optional[Union[Engine, Dict[str, Engine]]] = None,
    optimizers: Optional[Union[Optimizer, Dict[str, Optimizer]]] = None,
    **kwargs: Any,
) -> Optional[BaseLogger]:
    """Get Ignite provided logger.

    Parameters
    ----------
    config
        Config object for setting up loggers

        `config` has to contain
        - `output_dir`: logging path for output files (read by the
          TensorBoard branch as `config.output_dir`)
        - `logger_log_every_iters`: logging iteration interval for loggers

    trainer
        trainer engine
    evaluator
        evaluator engine
    optimizers
        optimizers to log optimizer parameters
    kwargs
        optional keyword arguments passed to the logger

    Returns
    -------
    logger_handler
        Ignite provided logger instance
    """

    {% if logger_deps == 'clearml' %}
    logger_handler = common.setup_clearml_logging(
        trainer=trainer,
        optimizers=optimizers,
        evaluators=evaluator,
        log_every_iters=config.logger_log_every_iters,
        **kwargs,
    )
    {% elif logger_deps == 'mlflow' %}
    logger_handler = common.setup_mlflow_logging(
        trainer=trainer,
        optimizers=optimizers,
        evaluators=evaluator,
        log_every_iters=config.logger_log_every_iters,
        **kwargs,
    )
    {% elif logger_deps == 'neptune-client' %}
    logger_handler = common.setup_neptune_logging(
        trainer=trainer,
        optimizers=optimizers,
        evaluators=evaluator,
        log_every_iters=config.logger_log_every_iters,
        **kwargs,
    )
    {% elif logger_deps == 'polyaxon-client' %}
    logger_handler = common.setup_plx_logging(
        trainer=trainer,
        optimizers=optimizers,
        evaluators=evaluator,
        log_every_iters=config.logger_log_every_iters,
        **kwargs,
    )
    {% elif logger_deps == 'tensorboard' %}
    logger_handler = common.setup_tb_logging(
        output_path=config.output_dir,
        trainer=trainer,
        optimizers=optimizers,
        evaluators=evaluator,
        log_every_iters=config.logger_log_every_iters,
        **kwargs,
    )
    {% elif logger_deps == 'visdom' %}
    logger_handler = common.setup_visdom_logging(
        trainer=trainer,
        optimizers=optimizers,
        evaluators=evaluator,
        log_every_iters=config.logger_log_every_iters,
        **kwargs,
    )
    {% elif logger_deps == 'wandb' %}
    logger_handler = common.setup_wandb_logging(
        trainer=trainer,
        optimizers=optimizers,
        evaluators=evaluator,
        log_every_iters=config.logger_log_every_iters,
        **kwargs,
    )
    {% else %}
    logger_handler = None
    {% endif %}
    return logger_handler
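At generation time the template collapses to a single branch. A minimal rendering sketch with Jinja2, assuming the body above is saved under the hypothetical filename get_logger.py.jinja:

from jinja2 import Template

# Hypothetical: render with TensorBoard selected; every other {% elif %}
# branch is dropped from the generated file.
with open("get_logger.py.jinja") as f:
    rendered = Template(f.read()).render(logger_deps="tensorboard")
print(rendered)

With logger_deps="tensorboard", the generated get_logger reduces to the single common.setup_tb_logging(...) call.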