Example #1
def test_save_best_model_by_val_score(dirname, capsys):

    trainer = Engine(lambda e, b: None)
    evaluator = Engine(lambda e, b: None)
    model = DummyModel()

    acc_scores = [0.1, 0.2, 0.3, 0.4, 0.3, 0.5, 0.6, 0.61, 0.7, 0.5]

    @trainer.on(Events.EPOCH_COMPLETED)
    def validate(engine):
        evaluator.run([0])

    @evaluator.on(Events.EPOCH_COMPLETED)
    def set_eval_metric(engine):
        engine.state.metrics = {"acc": acc_scores[trainer.state.epoch - 1]}

    save_best_model_by_val_score(dirname,
                                 evaluator,
                                 model,
                                 metric_name="acc",
                                 n_saved=2,
                                 trainer=trainer)

    data = [0]
    trainer.run(data, max_epochs=len(acc_scores))

    assert set(os.listdir(dirname)) == {
        "best_model_8_val_acc=0.6100.pt", "best_model_9_val_acc=0.7000.pt"}
Example #2
def training(config,
             local_rank=None,
             with_mlflow_logging=False,
             with_plx_logging=False):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
                                      "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=getattr(
                                          config, "fp16_opt_level", "O2"),
                                      num_losses=1)
    model = DDP(model, delay_allreduce=True)
    criterion = config.criterion.to(device)

    prepare_batch = getattr(config, "prepare_batch", _prepare_batch)
    non_blocking = getattr(config, "non_blocking", True)

    # Setup trainer
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    def train_update_function(engine, batch):

        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y) / accumulation_steps

        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return {
            'supervised batch loss': loss.item(),
        }

    trainer = Engine(train_update_function)
    common.setup_common_distrib_training_handlers(
        trainer,
        train_sampler,
        to_save={
            'model': model,
            'optimizer': optimizer
        },
        save_every_iters=1000,
        output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler,
        with_gpu_stats=True,
        output_names=[
            'supervised batch loss',
        ],
        with_pbars=True,
        with_pbar_on_iters=with_mlflow_logging,
        log_every_iters=1)

    if getattr(config, "benchmark_dataflow", False):
        benchmark_dataflow_num_iters = getattr(config,
                                               "benchmark_dataflow_num_iters",
                                               1000)
        DataflowBenchmark(benchmark_dataflow_num_iters,
                          prepare_batch=prepare_batch,
                          device=device).attach(trainer, train_loader)

    # Setup evaluators
    val_metrics = {
        "Accuracy": Accuracy(device=device),
        "Top-5 Accuracy": TopKCategoricalAccuracy(k=5, device=device),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    evaluator_args = dict(model=model,
                          metrics=val_metrics,
                          device=device,
                          non_blocking=non_blocking,
                          prepare_batch=prepare_batch,
                          output_transform=lambda x, y, y_pred: (
                              model_output_transform(y_pred),
                              y,
                          ))
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_mlflow_logging:
        ProgressBar(persist=False,
                    desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(_):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)),
        run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "Accuracy"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    if dist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(config.output_path.as_posix(),
                                            trainer,
                                            optimizer,
                                            evaluators={
                                                "training": train_evaluator,
                                                "validation": evaluator
                                            })
        if with_mlflow_logging:
            common.setup_mlflow_logging(trainer,
                                        optimizer,
                                        evaluators={
                                            "training": train_evaluator,
                                            "validation": evaluator
                                        })

        if with_plx_logging:
            common.setup_plx_logging(trainer,
                                     optimizer,
                                     evaluators={
                                         "training": train_evaluator,
                                         "validation": evaluator
                                     })

        common.save_best_model_by_val_score(config.output_path.as_posix(),
                                            evaluator,
                                            model,
                                            metric_name=score_metric_name,
                                            trainer=trainer)

        # Log train/val predictions:
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2))

        tb_logger.attach(train_evaluator,
                         log_handler=predictions_gt_images_handler(
                             img_denormalize_fn=config.img_denormalize,
                             n_images=15,
                             another_engine=trainer,
                             prefix_tag="training"),
                         event_name=Events.ITERATION_COMPLETED(
                             once=len(train_eval_loader) // 2))

    trainer.run(train_loader, max_epochs=config.num_epochs)
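
Note: the update function above implements gradient accumulation: the loss is divided by `accumulation_steps` and the optimizer only steps every `accumulation_steps` iterations. A minimal plain-PyTorch sketch of the same pattern (no AMP; `loader`, `model`, `optimizer`, `criterion` are assumed placeholders):

accumulation_steps = 4
for i, (x, y) in enumerate(loader, start=1):
    loss = criterion(model(x), y) / accumulation_steps
    loss.backward()  # gradients accumulate across backward() calls
    if i % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()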
Example #3
            # TODO: make sure `valid_state.metrics` is ordered so that the default metric reported to NNI is always the same
            nni.report_intermediate_result({'default': list(valid_state.metrics.values())[0], **train_metrics, **valid_metrics})

    if backend_conf.rank == 0:
        event = Events.ITERATION_COMPLETED(every=hp['log_progress_every_iters']) if hp['log_progress_every_iters'] else Events.ITERATION_COMPLETED
        ProgressBar(persist=False, desc='Train evaluation').attach(train_evaluator, event_name=event)
        ProgressBar(persist=False, desc='Test evaluation').attach(valid_evaluator)

        log_handler = OutputHandler(tag='train', metric_names=list(metrics.keys()), global_step_transform=global_step_from_engine(trainer))
        tb_logger.attach(train_evaluator, log_handler=log_handler, event_name=Events.COMPLETED)

        log_handler = OutputHandler(tag='test', metric_names=list(metrics.keys()), global_step_transform=global_step_from_engine(trainer))
        tb_logger.attach(valid_evaluator, log_handler=log_handler, event_name=Events.COMPLETED)

        # Store the best model by validation accuracy:
        common.save_best_model_by_val_score(str(output_path), valid_evaluator, model=model, metric_name='accuracy', n_saved=3, trainer=trainer, tag='val')

        if hp['log_grads_every_iters'] is not None and hp['log_grads_every_iters'] > 0:
            tb_logger.attach(trainer, log_handler=GradsHistHandler(model, tag=model.__class__.__name__), event_name=Events.ITERATION_COMPLETED(every=hp['log_grads_every_iters']))

    if hp['crash_iteration'] is not None and hp['crash_iteration'] >= 0:
        @trainer.on(Events.ITERATION_STARTED(once=hp['crash_iteration']))
        def _(engine):
            raise Exception('STOP at iteration: {}'.format(engine.state.iteration))

    if nni_compression_pruner is not None:
        # Notify the NNI compressor (pruning or quantization) on each epoch, and on each step/batch-iteration
        # if needed by the provided Pruner/Quantizer. See the NNI Compression documentation for details:
        # https://nni.readthedocs.io/en/latest/Compressor/QuickStart.html#apis-for-updating-fine-tuning-status
        @trainer.on(Events.EPOCH_STARTED)
        def _nni_compression_update_epoch(engine):
            nni_compression_pruner.update_epoch(engine.state.epoch)
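
Note: the fragment above reports metrics to NNI and drives an NNI compressor. A hedged sketch of the two reporting calls (values illustrative):

import nni

# Per-epoch report; when given a dict, NNI reads the 'default' key as the tuned metric
nni.report_intermediate_result({"default": 0.91, "train_loss": 0.35})

# One final report when the trial ends
nni.report_final_result(0.93)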
Example #4
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-QAT-Training",
                          distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")

        folder_name = "{}_backend-{}-{}_{}".format(config["model"],
                                                   idist.backend(),
                                                   idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators, as they won't have exactly the same role:
    # - `evaluator` runs on the test set and is used to save the best model by validation score
    # - `train_evaluator` runs on the training set, for monitoring only
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store the best model by validation accuracy:
    common.save_best_model_by_val_score(
        output_path=config["output_path"],
        evaluator=evaluator,
        model=model,
        metric_name="Accuracy",
        n_saved=1,
        trainer=trainer,
        tag="test",
    )

    trainer.run(train_loader, max_epochs=config["num_epochs"])

    if rank == 0:
        tb_logger.close()
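
Note: `log_metrics` is not defined in these snippets. A plausible minimal implementation consistent with the call sites above (hypothetical helper):

def log_metrics(logger, epoch, elapsed, tag, metrics):
    # One line per evaluation run: epoch, split tag, wall time and all metric values
    metrics_str = " - ".join("{}: {}".format(k, v) for k, v in metrics.items())
    logger.info("Epoch {} | {} time (s): {:.2f} | {}".format(epoch, tag, elapsed, metrics_str))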
Example #5
def training(local_rank, config, logger=None):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    torch.backends.cudnn.benchmark = True

    set_seed(config.seed + local_rank)

    train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader

    # Setup model, optimizer, criterion
    model, optimizer, criterion = initialize(config)

    if not hasattr(config, "prepare_batch"):
        config.prepare_batch = _prepare_batch

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler,
                             config, logger)

    if getattr(config, "benchmark_dataflow", False):
        benchmark_dataflow_num_iters = getattr(config,
                                               "benchmark_dataflow_num_iters",
                                               1000)
        DataflowBenchmark(benchmark_dataflow_num_iters,
                          prepare_batch=config.prepare_batch).attach(
                              trainer, train_loader)

    # Setup evaluators
    val_metrics = {
        "Accuracy": Accuracy(),
        "Top-5 Accuracy": TopKCategoricalAccuracy(k=5),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator, train_evaluator = create_evaluators(model, val_metrics, config)

    @trainer.on(
        Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1))
        | Events.COMPLETED)
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(val_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)

    score_metric_name = "Accuracy"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    # Store 3 best models by validation accuracy:
    common.save_best_model_by_val_score(
        config.output_path.as_posix(),
        evaluator,
        model=model,
        metric_name=score_metric_name,
        n_saved=3,
        trainer=trainer,
        tag="val",
    )

    if idist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={
                "training": train_evaluator,
                "validation": evaluator
            },
        )

        exp_tracking_logger = exp_tracking.setup_logging(trainer,
                                                         optimizer,
                                                         evaluators={
                                                             "training":
                                                             train_evaluator,
                                                             "validation":
                                                             evaluator
                                                         })

        # Log train/val predictions:
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2),
        )

        tb_logger.attach(
            train_evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="training"),
            event_name=Events.ITERATION_COMPLETED(
                once=len(train_eval_loader) // 2),
        )

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if idist.get_rank() == 0:
        tb_logger.close()
        exp_tracking_logger.close()
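
Note: `common.add_early_stopping_by_val_score` is roughly a shortcut for attaching ignite's `EarlyStopping` handler by hand. A minimal sketch, assuming the metric is "higher is better":

from ignite.engine import Events
from ignite.handlers import EarlyStopping

def score_fn(engine):
    return engine.state.metrics["Accuracy"]

es_handler = EarlyStopping(patience=config.es_patience,
                           score_function=score_fn,
                           trainer=trainer)  # stops the trainer, not the evaluator
evaluator.add_event_handler(Events.COMPLETED, es_handler)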
Example #6
def run(output_path, config):

    distributed = dist.is_available() and dist.is_initialized()
    rank = dist.get_rank() if distributed else 0

    manual_seed(config["seed"] + rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = utils.get_dataflow(config, distributed)
    model, optimizer = utils.get_model_optimizer(config, distributed)
    criterion = nn.CrossEntropyLoss().to(utils.device)

    le = len(train_loader)
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)

    # Setup Ignite trainer:
    # - let's define training step
    # - add other common handlers:
    #    - TerminateOnNan,
    #    - handler to setup learning rate scheduling,
    #    - ModelCheckpoint
    #    - `RunningAverage` on `train_step` output
    #    - Two progress bars on epochs and optionally on iterations

    def train_step(engine, batch):

        x = convert_tensor(batch[0], device=utils.device, non_blocking=True)
        y = convert_tensor(batch[1], device=utils.device, non_blocking=True)

        model.train()
        # Supervised part
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return {
            "batch loss": loss.item(),
        }

    if config["deterministic"] and rank == 0:
        print("Setup deterministic trainer")
    trainer = Engine(train_step) if not config["deterministic"] else DeterministicEngine(train_step)
    train_sampler = train_loader.sampler if distributed else None
    to_save = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler}
    metric_names = [
        "batch loss",
    ]
    common.setup_common_training_handlers(
        trainer,
        train_sampler=train_sampler,
        to_save=to_save,
        save_every_iters=config["checkpoint_every"],
        output_path=output_path,
        lr_scheduler=lr_scheduler,
        output_names=metric_names,
        with_pbar_on_iters=config["display_iters"],
        log_every_iters=10,
    )

    if rank == 0:
        # Setup Tensorboard logger - wrapper on SummaryWriter
        tb_logger = TensorboardLogger(log_dir=output_path)
        # Attach logger to the trainer and log trainer's metrics (stored in trainer.state.metrics) every iteration
        tb_logger.attach(
            trainer,
            log_handler=OutputHandler(tag="train", metric_names=metric_names),
            event_name=Events.ITERATION_COMPLETED,
        )
        # log optimizer's parameters: "lr" every iteration
        tb_logger.attach(
            trainer, log_handler=OptimizerParamsHandler(optimizer, param_name="lr"), event_name=Events.ITERATION_STARTED
        )

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(device=utils.device if distributed else None),
        "loss": Loss(criterion, device=utils.device if distributed else None),
    }

    # We define two evaluators, as they won't have exactly the same role:
    # - `evaluator` runs on the test set and is used to save the best model by validation score
    # - `train_evaluator` runs on the training set, for monitoring only
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=utils.device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=utils.device, non_blocking=True)

    def run_validation(engine):
        train_evaluator.run(train_loader)
        evaluator.run(test_loader)

    trainer.add_event_handler(Events.EPOCH_STARTED(every=config["validate_every"]), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup progress bar on evaluation engines
        if config["display_iters"]:
            ProgressBar(persist=False, desc="Train evaluation").attach(train_evaluator)
            ProgressBar(persist=False, desc="Test evaluation").attach(evaluator)

        # Let's log metrics of `train_evaluator` stored in `train_evaluator.state.metrics` when validation run is done
        tb_logger.attach(
            train_evaluator,
            log_handler=OutputHandler(
                tag="train", metric_names="all", global_step_transform=global_step_from_engine(trainer)
            ),
            event_name=Events.COMPLETED,
        )

        # Let's log metrics of `evaluator` stored in `evaluator.state.metrics` when validation run is done
        tb_logger.attach(
            evaluator,
            log_handler=OutputHandler(
                tag="test", metric_names="all", global_step_transform=global_step_from_engine(trainer)
            ),
            event_name=Events.COMPLETED,
        )

        # Store 3 best models by validation accuracy:
        common.save_best_model_by_val_score(
            output_path, evaluator, model=model, metric_name="accuracy", n_saved=3, trainer=trainer, tag="test"
        )

        # Optionally log model gradients
        if config["log_model_grads_every"] is not None:
            tb_logger.attach(
                trainer,
                log_handler=GradsHistHandler(model, tag=model.__class__.__name__),
                event_name=Events.ITERATION_COMPLETED(every=config["log_model_grads_every"]),
            )

    # In order to check training resuming we can emulate a crash
    if config["crash_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["crash_iteration"]))
        def _(engine):
            raise Exception("STOP at iteration: {}".format(engine.state.iteration))

    resume_from = config["resume_from"]
    if resume_from is not None:
        checkpoint_fp = Path(resume_from)
        assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(checkpoint_fp.as_posix())
        print("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix()))
        checkpoint = torch.load(checkpoint_fp.as_posix())
        Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
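
Note: the `milestones_values` triple above encodes a linear warmup to the peak learning rate followed by a linear decay to zero. A standalone sketch with illustrative numbers (in older ignite versions `PiecewiseLinear` lives under `ignite.contrib.handlers`):

from ignite.engine import Events
from ignite.contrib.handlers import PiecewiseLinear

# lr: 0.0 -> 0.1 over the first 1000 iterations, then 0.1 -> 0.0 by iteration 10000
scheduler = PiecewiseLinear(optimizer, param_name="lr",
                            milestones_values=[(0, 0.0), (1000, 0.1), (10000, 0.0)])
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)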
Example #7
def training(config, local_rank, with_pbar_on_iters=True):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
                                      "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=getattr(
                                          config, "fp16_opt_level", "O2"),
                                      num_losses=1)
    model = DDP(model, delay_allreduce=True)

    criterion = config.criterion.to(device)

    # Setup trainer
    prepare_batch = getattr(config, "prepare_batch")
    non_blocking = getattr(config, "non_blocking", True)
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    def train_update_function(engine, batch):
        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y)

        if isinstance(loss, Mapping):
            assert 'supervised batch loss' in loss
            loss_dict = loss
            output = {k: v.item() for k, v in loss_dict.items()}
            loss = loss_dict['supervised batch loss'] / accumulation_steps
        else:
            output = {'supervised batch loss': loss.item()}

        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return output

    output_names = getattr(config, "output_names", [
        'supervised batch loss',
    ])

    trainer = Engine(train_update_function)
    common.setup_common_distrib_training_handlers(
        trainer,
        train_sampler,
        to_save={
            'model': model,
            'optimizer': optimizer
        },
        save_every_iters=1000,
        output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler,
        output_names=output_names,
        with_pbars=True,
        with_pbar_on_iters=with_pbar_on_iters,
        log_every_iters=1)

    def output_transform(output):
        return output['y_pred'], output['y']

    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes,
                                output_transform=output_transform)
    pr = cmPrecision(cm_metric, average=False)
    re = cmRecall(cm_metric, average=False)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
        "Accuracy": cmAccuracy(cm_metric),
        "Precision": pr,
        "Recall": re,
        "F1": Fbeta(beta=1.0, output_transform=output_transform)
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator_args = dict(model=model,
                          metrics=val_metrics,
                          device=device,
                          non_blocking=non_blocking,
                          prepare_batch=prepare_batch,
                          output_transform=lambda x, y, y_pred: {
                              'y_pred': model_output_transform(y_pred),
                              'y': y
                          })
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_pbar_on_iters:
        ProgressBar(persist=False,
                    desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(engine):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)),
        run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    if dist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(config.output_path.as_posix(),
                                            trainer,
                                            optimizer,
                                            evaluators={
                                                "training": train_evaluator,
                                                "validation": evaluator
                                            })
        common.setup_mlflow_logging(trainer,
                                    optimizer,
                                    evaluators={
                                        "training": train_evaluator,
                                        "validation": evaluator
                                    })

        common.save_best_model_by_val_score(config.output_path.as_posix(),
                                            evaluator,
                                            model,
                                            metric_name=score_metric_name,
                                            trainer=trainer)

        # Log train/val predictions:
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2))

        log_train_predictions = getattr(config, "log_train_predictions", False)
        if log_train_predictions:
            tb_logger.attach(train_evaluator,
                             log_handler=predictions_gt_images_handler(
                                 img_denormalize_fn=config.img_denormalize,
                                 n_images=15,
                                 another_engine=trainer,
                                 prefix_tag="validation"),
                             event_name=Events.ITERATION_COMPLETED(
                                 once=len(train_eval_loader) // 2))

    trainer.run(train_loader, max_epochs=config.num_epochs)
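
Note: all segmentation metrics above (IoU, mIoU, accuracy, precision, recall) are derived views over one shared `ConfusionMatrix`, so the matrix is accumulated only once per evaluation run. A minimal sketch (`num_classes` illustrative):

from ignite.metrics import ConfusionMatrix, IoU, mIoU

cm = ConfusionMatrix(num_classes=21)
val_metrics = {
    "IoU": IoU(cm),    # per-class IoU tensor
    "mIoU": mIoU(cm),  # scalar mean over classes
}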
Example #8
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = "stop-on-{}".format(config["stop_iteration"])

        folder_name = "{}_backend-{}-{}_{}".format(config["model"], idist.backend(), idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }

    # We define two evaluators, as they won't have exactly the same role:
    # - `evaluator` runs on the test set and is used to save the best model by validation score
    # - `train_evaluator` runs on the training set, for monitoring only
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)

        if config["with_trains"]:
            trains_logger = common.setup_trains_logging(
                trainer,
                optimizer,
                evaluators=evaluators,
                project_name="cifar10-ignite",
                task_name=Path(output_path).stem,
            )

    # Store 3 best models by validation accuracy:
    common.save_best_model_by_val_score(
        output_path, evaluator, model=model, metric_name="accuracy", n_saved=3, trainer=trainer, tag="test"
    )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info("Stop training on {} iteration".format(trainer.state.iteration))
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
        if config["with_trains"]:
            trains_logger.close()
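
Note: `training(local_rank, config)` above matches the spawn signature expected by `ignite.distributed`. A minimal launcher sketch (backend, process count and config values are assumptions):

import ignite.distributed as idist

if __name__ == "__main__":
    config = {"seed": 543, "model": "resnet18", "output_path": "/tmp/output",
              "num_epochs": 24, "validate_every": 3, "batch_size": 512,
              "stop_iteration": None, "with_trains": False}  # illustrative values
    with idist.Parallel(backend="nccl", nproc_per_node=2) as parallel:
        parallel.run(training, config)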
Example #9
def training(config, local_rank, with_pbar_on_iters=True):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
                                      "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    unsup_train_loader = config.unsup_train_loader
    unsup_train_sampler = getattr(unsup_train_loader, "sampler", None)
    assert unsup_train_sampler is not None, "Train loader of type '{}' " \
                                      "should have attribute 'sampler'".format(type(unsup_train_loader))
    assert hasattr(unsup_train_sampler, 'set_epoch') and callable(unsup_train_sampler.set_epoch), \
        "Unsupervised train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(model, optimizer, opt_level=getattr(config, "fp16_opt_level", "O2"), num_losses=2)
    model = DDP(model, delay_allreduce=True)
    
    criterion = config.criterion.to(device)
    unsup_criterion = config.unsup_criterion.to(device)
    unsup_batch_num_repetitions = getattr(config, "unsup_batch_num_repetitions", 1)

    # Setup trainer
    prepare_batch = getattr(config, "prepare_batch")
    non_blocking = getattr(config, "non_blocking", True)
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform", lambda x: x)

    def cycle(seq):
        while True:
            for i in seq:
                yield i

    unsup_train_loader_iter = cycle(unsup_train_loader)

    def supervised_loss(batch):
        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y)
        return loss

    def unsupervised_loss(x):

        with torch.no_grad():
            y_pred_orig = model(x)

            # Data augmentation: geom only
            k = random.randint(1, 3)
            x_aug = torch.rot90(x, k=k, dims=(2, 3))
            y_pred_orig_aug = torch.rot90(y_pred_orig, k=k, dims=(2, 3))
            if random.random() < 0.5:
                x_aug = torch.flip(x_aug, dims=(2, ))
                y_pred_orig_aug = torch.flip(y_pred_orig_aug, dims=(2, )) 
            if random.random() < 0.5:
                x_aug = torch.flip(x_aug, dims=(3, ))
                y_pred_orig_aug = torch.flip(y_pred_orig_aug, dims=(3, )) 

            y_pred_orig_aug = y_pred_orig_aug.argmax(dim=1).long()

        y_pred_aug = model(x_aug.detach())

        loss = unsup_criterion(y_pred_aug, y_pred_orig_aug.detach())

        return loss

    def train_update_function(engine, batch):
        model.train()

        loss = supervised_loss(batch)
        if isinstance(loss, Mapping):
            assert 'supervised batch loss' in loss
            loss_dict = loss
            output = {k: v.item() for k, v in loss_dict.items()}
            loss = loss_dict['supervised batch loss'] / accumulation_steps
        else:
            output = {'supervised batch loss': loss.item()}
        
        # Difference with the original UDA:
        # apply gradients from the supervised and unsupervised parts separately
        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        unsup_batch = next(unsup_train_loader_iter)
        unsup_x = unsup_batch['image']
        unsup_x = convert_tensor(unsup_x, device=device, non_blocking=non_blocking)

        for _ in range(unsup_batch_num_repetitions):
            unsup_loss = engine.state.unsup_lambda * unsupervised_loss(unsup_x)

            assert isinstance(unsup_loss, torch.Tensor)
            output['unsupervised batch loss'] = unsup_loss.item()

            with amp.scale_loss(unsup_loss, optimizer, loss_id=1) as scaled_loss:
                scaled_loss.backward()

            if engine.state.iteration % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

        unsup_batch = None
        unsup_x = None

        total_loss = loss + unsup_loss
        output['total batch loss'] = total_loss.item()

        return output

    output_names = getattr(config, "output_names", 
                           ['supervised batch loss', 'unsupervised batch loss', 'total batch loss'])

    trainer = Engine(train_update_function)

    @trainer.on(Events.STARTED)
    def init(engine):
        if hasattr(config, "unsup_lambda_min"):
            engine.state.unsup_lambda = config.unsup_lambda_min
        else:
            engine.state.unsup_lambda = getattr(config, "unsup_lambda", 0.001)

    @trainer.on(Events.ITERATION_COMPLETED)
    def update_unsup_params(engine):        
        engine.state.unsup_lambda += getattr(config, "unsup_lambda_delta", 0.00001)
        if hasattr(config, "unsup_lambda_max"):
            m = config.unsup_lambda_max
            engine.state.unsup_lambda = min(engine.state.unsup_lambda, m)

    common.setup_common_distrib_training_handlers(
        trainer, train_sampler,
        to_save={'model': model, 'optimizer': optimizer},
        save_every_iters=1000, output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler, output_names=output_names,
        with_pbars=True, with_pbar_on_iters=with_pbar_on_iters, log_every_iters=1
    )

    def output_transform(output):        
        return output['y_pred'], output['y']

    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes, output_transform=output_transform)
    pr = cmPrecision(cm_metric, average=False)
    re = cmRecall(cm_metric, average=False)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
        "Accuracy": cmAccuracy(cm_metric),
        "Precision": pr,
        "Recall": re,
        "F1": Fbeta(beta=1.0, output_transform=output_transform)
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator_args = dict(
        model=model, metrics=val_metrics, device=device, non_blocking=non_blocking, prepare_batch=prepare_batch,
        output_transform=lambda x, y, y_pred: {'y_pred': model_output_transform(y_pred), 'y': y}
    )
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_pbar_on_iters:
        ProgressBar(persist=False, desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(engine):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    if dist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(config.output_path.as_posix(), trainer, optimizer,
                                            evaluators={"training": train_evaluator, "validation": evaluator})
        common.setup_mlflow_logging(trainer, optimizer,
                                    evaluators={"training": train_evaluator, "validation": evaluator})

        common.save_best_model_by_val_score(config.output_path.as_posix(), evaluator, model,
                                            metric_name=score_metric_name, trainer=trainer)

        # Log unsup_lambda
        @trainer.on(Events.ITERATION_COMPLETED(every=100))
        def tblog_unsupervised_lambda(engine):
            tb_logger.writer.add_scalar("training/unsupervised lambda", engine.state.unsup_lambda, engine.state.iteration)
            mlflow.log_metric("training unsupervised lambda", engine.state.unsup_lambda, step=engine.state.iteration)

        # Log train/val predictions:
        tb_logger.attach(evaluator,
                         log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize,
                                                                   n_images=15,
                                                                   another_engine=trainer,
                                                                   prefix_tag="validation"),
                         event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2))

        log_train_predictions = getattr(config, "log_train_predictions", False)
        if log_train_predictions:
            tb_logger.attach(train_evaluator,
                             log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize,
                                                                       n_images=15,
                                                                       another_engine=trainer,
                                                                       prefix_tag="validation"),
                             event_name=Events.ITERATION_COMPLETED(once=len(train_eval_loader) // 2))

    trainer.run(train_loader, max_epochs=config.num_epochs)
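
Note: the example above keeps two independent AMP loss scalers (`num_losses=2`, `loss_id` 0 and 1), so the supervised and unsupervised losses get separate dynamic loss scaling. A minimal apex sketch of that pattern (`sup_loss`/`unsup_loss` are placeholders):

from apex import amp

model, optimizer = amp.initialize(model, optimizer, opt_level="O2", num_losses=2)

with amp.scale_loss(sup_loss, optimizer, loss_id=0) as scaled:
    scaled.backward()
with amp.scale_loss(unsup_loss, optimizer, loss_id=1) as scaled:
    scaled.backward()
optimizer.step()
optimizer.zero_grad()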
Example #10
def finetune_model(model_class,
                   project_path,
                   batch_size,
                   num_workers=0,
                   pin_memory=True,
                   non_blocking=True,
                   device=None,
                   base_lr=1e-4,
                   max_lr=1e-3,
                   lr_gamma=0.9,
                   lr_decay_iters=None,
                   weight_decay=0.0,
                   loss_func=None,
                   n_epochs=1,
                   patience=-1,
                   data_augmentation=True,
                   combination_module=simple_concatenation,
                   combination_size=KinshipClassifier.FACENET_OUT_SIZE * 2,
                   simple_fc_layers=None,
                   custom_fc_layers=None,
                   final_fc_layers=None,
                   train_ds_name=None,
                   dev_ds_name=None,
                   logging_rate=-1,
                   saving_rate=-1,
                   experiment_name=None,
                   checkpoint_name=None,
                   hof_size=1,
                   checkpoint_exp=None):
    if device is None:
        device = torch.device('cpu')

    if loss_func is None:
        loss_func = torch.nn.CrossEntropyLoss()

    if simple_fc_layers is None:
        simple_fc_layers = [1024]

    if custom_fc_layers is None:
        custom_fc_layers = [1024]

    if final_fc_layers is None:
        final_fc_layers = []

    if train_ds_name is None:
        train_ds_name = 'train_dataset.pkl'

    if dev_ds_name is None:
        dev_ds_name = 'dev_dataset.pkl'

    if checkpoint_exp is None:
        checkpoint_exp = experiment_name

    model = model_class(combination_module, combination_size, simple_fc_layers,
                        custom_fc_layers, final_fc_layers)

    data_path = os.path.join(project_path, 'data')
    processed_path = os.path.join(data_path, 'processed')

    dataset_names = {'train': train_ds_name, 'dev': dev_ds_name}
    dataset_paths = {
        partition: os.path.join(data_path, dataset_names[partition])
        for partition in dataset_names
    }
    raw_paths = {
        partition: os.path.join(processed_path, partition)
        for partition in dataset_paths
    }

    relationships_path = os.path.join(data_path, 'raw',
                                      'train_relationships.csv')
    datasets = {
        partition: KinshipDataset.get_dataset(
            dataset_paths[partition], raw_paths[partition], relationships_path,
            data_augmentation and (partition == 'train'))
        for partition in raw_paths
    }

    dataloaders = {
        partition: DataLoader(datasets[partition],
                              batch_size=batch_size,
                              shuffle=(partition == 'train'),
                              num_workers=num_workers,
                              pin_memory=pin_memory)
        for partition in datasets
    }

    params_to_train = list(
        filter(lambda x: x.requires_grad, model.parameters()))
    optimizer = optim.AdamW(params_to_train,
                            lr=base_lr,
                            weight_decay=weight_decay)
    lr_decay_iters = len(
        dataloaders['train']) if lr_decay_iters is None else lr_decay_iters
    lr_scheduler = optim.lr_scheduler.CyclicLR(optimizer,
                                               base_lr=base_lr,
                                               max_lr=max_lr,
                                               step_size_up=lr_decay_iters // 2,
                                               mode='exp_range',
                                               gamma=lr_gamma,
                                               cycle_momentum=False)

    train_engine = create_supervised_trainer(model,
                                             optimizer,
                                             loss_fn=loss_func,
                                             device=device,
                                             non_blocking=non_blocking)

    if checkpoint_exp is not None and checkpoint_name is not None:
        experiment_dir = os.path.join(project_path, 'experiments',
                                      checkpoint_exp)
        model, optimizer, loss_func, lr_scheduler, train_engine = load_checkpoint(
            model_class, experiment_dir, checkpoint_name, device)

    eval_engine = create_supervised_evaluator(
        model,
        metrics=dict(accuracy=Accuracy(), cross_entropy=Loss(loss_func)),
        device=device,
        non_blocking=non_blocking)

    metrics = {}

    if logging_rate > 0:
        metrics['ce_history'] = []
        metrics['smoothed_loss_history'] = []
        beta = 0.98
        avg_loss = 0.0

        @train_engine.on(Events.ITERATION_COMPLETED(every=logging_rate))
        def log_iteration_training_metrics(engine):
            nonlocal metrics
            metrics['ce_history'].append(engine.state.output)

        @train_engine.on(Events.ITERATION_COMPLETED(every=logging_rate))
        def log_smoothed_lr(engine: Engine):
            nonlocal avg_loss
            avg_loss = (avg_loss * beta) + (engine.state.output * (1 - beta))
            metrics['smoothed_loss_history'].append(
                avg_loss /
                (1 - (beta**(len(metrics['smoothed_loss_history']) + 1))))

        @train_engine.on(Events.EPOCH_COMPLETED)
        def plot_metrics(engine):
            plot_metric(metrics['smoothed_loss_history'],
                        f"Smoothed loss epoch #{engine.state.epoch}",
                        "Cross Entropy",
                        index_scale=logging_rate)

    if patience >= 0:
        common.add_early_stopping_by_val_score(patience, eval_engine,
                                               train_engine, 'accuracy')

    # Replaced by setup_common_training_handlers
    # nan_terminate = TerminateOnNan()
    # train_engine.add_event_handler(Events.ITERATION_COMPLETED, nan_terminate)

    @train_engine.on(Events.EPOCH_COMPLETED)
    def print_training_metrics(engine):
        print(f"Finished epoch {engine.state.epoch}")
        if train_ds_name == dev_ds_name:
            print(f"Epoch {engine.state.epoch}: CE = {engine.state.output}")
            metrics['final_dev_loss'] = engine.state.output
            return
        eval_engine.run(dataloaders['dev'])
        metrics['final_dev_loss'] = eval_engine.state.metrics['cross_entropy']
        print(
            f"Epoch {engine.state.epoch}: CE = {eval_engine.state.metrics['cross_entropy']}, "
            f"Acc = {eval_engine.state.metrics['accuracy']}")

    # Replaced by setup_common_training_handlers
    # @train_engine.on(Events.ITERATION_COMPLETED)
    # def change_lr(engine):
    #     lr_scheduler.step()

    to_save = None
    output_path = None

    if saving_rate > 0:
        if experiment_name is None:
            print("Warning: saving rate specified but experiment name is None")
            exit()

        experiment_path = os.path.join(project_path, 'experiments',
                                       experiment_name)
        if not os.path.isdir(experiment_path):
            os.mkdir(experiment_path)

        with open(os.path.join(experiment_path, 'model.config'),
                  'wb+') as config_file:
            pickle.dump(model.get_configuration(), config_file)

        to_save = {
            'model': model,
            'optimizer': optimizer,
            'loss_func': loss_func,
            'lr_scheduler': lr_scheduler,
            'train_engine': train_engine
        }
        output_path = experiment_path

        best_models_dir = os.path.join(output_path, 'best_models')
        if not os.path.isdir(best_models_dir):
            os.mkdir(best_models_dir)
        common.save_best_model_by_val_score(best_models_dir,
                                            eval_engine,
                                            model,
                                            'accuracy',
                                            n_saved=hof_size,
                                            trainer=train_engine,
                                            tag='acc')

        # Replaced by setup_common_training_handlers
        # checkpointer = ModelCheckpoint(experiment_path, 'iter', n_saved=50,
        #                                global_step_transform=lambda engine, _:
        #                                f"{engine.state.epoch}-{engine.state.iteration}", require_empty=False)
        # train_engine.add_event_handler(Events.ITERATION_COMPLETED(every=saving_rate), checkpointer, to_save)

    common.setup_common_training_handlers(train_engine,
                                          to_save=to_save,
                                          save_every_iters=saving_rate,
                                          output_path=output_path,
                                          lr_scheduler=lr_scheduler,
                                          with_pbars=True,
                                          with_pbar_on_iters=True,
                                          log_every_iters=1,
                                          device=device)

    # Replaced by setup_common_training_handlers
    # train_pbar = ProgressBar()
    # train_pbar.attach(train_engine)
    #
    eval_pbar = ProgressBar(persist=False, desc="Evaluation")
    eval_pbar.attach(eval_engine)

    print(model)
    print("Running on:", device)
    train_engine.run(dataloaders['train'], max_epochs=n_epochs)

    return model, metrics
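
Note: `log_smoothed_lr` above maintains an exponentially weighted moving average of the loss with bias correction; dividing by 1 - beta**t counteracts the zero initialization of the average. A standalone sketch (`raw_losses` is a placeholder iterable of floats):

beta = 0.98
avg_loss, history = 0.0, []
for t, loss in enumerate(raw_losses, start=1):
    avg_loss = beta * avg_loss + (1 - beta) * loss
    history.append(avg_loss / (1 - beta ** t))  # bias-corrected estimate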
Example #11
def run(output_path, config):
    device = "cuda"

    local_rank = config["local_rank"]
    distributed = backend is not None  # `backend` is defined outside this snippet
    if distributed:
        torch.cuda.set_device(local_rank)
        device = "cuda"
    rank = dist.get_rank() if distributed else 0

    torch.manual_seed(config["seed"] + rank)

    # Rescale batch_size and num_workers
    ngpus_per_node = torch.cuda.device_count()
    ngpus = dist.get_world_size() if distributed else 1
    batch_size = config["batch_size"] // ngpus
    num_workers = int(
        (config["num_workers"] + ngpus_per_node - 1) / ngpus_per_node)

    train_loader, test_loader = get_train_test_loaders(
        path=config["data_path"],
        batch_size=batch_size,
        distributed=distributed,
        num_workers=num_workers,
    )

    model = get_model(config["model"])
    model = model.to(device)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[
                local_rank,
            ],
            output_device=local_rank,
        )

    optimizer = optim.SGD(
        model.parameters(),
        lr=config["learning_rate"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        nesterov=True,
    )

    criterion = nn.CrossEntropyLoss().to(device)

    le = len(train_loader)
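    # Piecewise-linear LR schedule: warm up from 0 to the base LR over
    # `num_warmup_epochs`, then decay linearly back to 0 by the last epoch.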
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    def _prepare_batch(batch, device, non_blocking):
        x, y = batch
        return (
            convert_tensor(x, device=device, non_blocking=non_blocking),
            convert_tensor(y, device=device, non_blocking=non_blocking),
        )

    def process_function(engine, batch):

        x, y = _prepare_batch(batch, device=device, non_blocking=True)

        model.train()
        # Supervised part
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return {
            "batch loss": loss.item(),
        }

    trainer = Engine(process_function)
    train_sampler = train_loader.sampler if distributed else None
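    # Objects whose state is checkpointed so that training can resume exactly.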
    to_save = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": lr_scheduler,
    }
    metric_names = [
        "batch loss",
    ]
    common.setup_common_training_handlers(
        trainer,
        train_sampler=train_sampler,
        to_save=to_save,
        save_every_iters=config["checkpoint_every"],
        output_path=output_path,
        lr_scheduler=lr_scheduler,
        output_names=metric_names,
        with_pbar_on_iters=config["display_iters"],
        log_every_iters=10,
    )

    if rank == 0:
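        # Only the rank-0 process writes TensorBoard logs.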
        tb_logger = TensorboardLogger(log_dir=output_path)
        tb_logger.attach(
            trainer,
            log_handler=OutputHandler(tag="train", metric_names=metric_names),
            event_name=Events.ITERATION_COMPLETED,
        )
        tb_logger.attach(
            trainer,
            log_handler=OptimizerParamsHandler(optimizer, param_name="lr"),
            event_name=Events.ITERATION_STARTED,
        )

    metrics = {
        "accuracy": Accuracy(device=device if distributed else None),
        "loss": Loss(criterion, device=device if distributed else None),
    }

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
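        # Flush pending CUDA work before evaluating.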
        torch.cuda.synchronize()
        train_evaluator.run(train_loader)
        evaluator.run(test_loader)

    trainer.add_event_handler(
        Events.EPOCH_STARTED(every=config["validate_every"]), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    if rank == 0:
        if config["display_iters"]:
            ProgressBar(persist=False,
                        desc="Train evaluation").attach(train_evaluator)
            ProgressBar(persist=False,
                        desc="Test evaluation").attach(evaluator)

        tb_logger.attach(
            train_evaluator,
            log_handler=OutputHandler(
                tag="train",
                metric_names=list(metrics.keys()),
                global_step_transform=global_step_from_engine(trainer),
            ),
            event_name=Events.COMPLETED,
        )

        tb_logger.attach(
            evaluator,
            log_handler=OutputHandler(
                tag="test",
                metric_names=list(metrics.keys()),
                global_step_transform=global_step_from_engine(trainer),
            ),
            event_name=Events.COMPLETED,
        )

        # Store the best model by validation accuracy:
        common.save_best_model_by_val_score(
            output_path,
            evaluator,
            model=model,
            metric_name="accuracy",
            n_saved=3,
            trainer=trainer,
            tag="test",
        )

        if config["log_model_grads_every"] is not None:
            tb_logger.attach(
                trainer,
                log_handler=GradsHistHandler(model,
                                             tag=model.__class__.__name__),
                event_name=Events.ITERATION_COMPLETED(
                    every=config["log_model_grads_every"]),
            )

    if config["crash_iteration"] is not None:

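        # Crash deliberately at a fixed iteration to exercise the
        # resume-from-checkpoint path below.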
        @trainer.on(Events.ITERATION_STARTED(once=config["crash_iteration"]))
        def _(engine):
            raise Exception("STOP at iteration: {}".format(
                engine.state.iteration))

    resume_from = config["resume_from"]
    if resume_from is not None:
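        # Restore every stateful object in `to_save` from the checkpoint.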
        checkpoint_fp = Path(resume_from)
        assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(
            checkpoint_fp.as_posix())
        print("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix()))
        checkpoint = torch.load(checkpoint_fp.as_posix())
        Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
Example #12
0
def get_handlers(
    config: Any,
    model: Module,
    trainer: Engine,
    evaluator: Engine,
    metric_name: str,
    es_metric_name: str,
    train_sampler: Optional[DistributedSampler] = None,
    to_save: Optional[Mapping] = None,
    lr_scheduler: Optional[LRScheduler] = None,
    output_names: Optional[Iterable[str]] = None,
    **kwargs: Any,
) -> Union[Tuple[Checkpoint, EarlyStopping, Timer], Tuple[None, None, None]]:
    """Get best model, earlystopping, timer handlers.

    Parameters
    ----------
    config
        Config object for setting up handlers

    `config` has to contain
    - `output_dir`: output path where `to_save` objects are stored
    - `save_every_iters`: saving interval in iterations
    - `n_saved`: number of best models to keep
    - `log_every_iters`: logging interval for the iteration progress bar (and `GpuInfo`, if enabled)
    - `with_pbars`: whether to show epoch-wise and iteration-wise progress bars
    - `with_pbar_on_iters`: whether to show the iteration-wise progress bar
    - `stop_on_nan`: stop training if the engine output contains NaN/inf values
    - `clear_cuda_cache`: clear the CUDA cache at the end of every epoch
    - `with_gpu_stats`: log GPU stats (used-memory and utilization percentages)
    - `patience`: number of events to wait without improvement before stopping training
    - `limit_sec`: maximum training time in seconds before termination

    model
        the model whose best version is checkpointed
    trainer
        the engine used for training
    evaluator
        the engine used for evaluation
    metric_name
        name of the evaluation metric used to select the best model
    es_metric_name
        name of the evaluation metric used for early stopping
    train_sampler
        distributed training sampler whose `set_epoch` is called every epoch
    to_save
        objects to save during training
    lr_scheduler
        learning rate scheduler, either a native torch `LRScheduler` or an ignite parameter scheduler
    output_names
        list of names associated with `trainer`'s process_function output dictionary
    kwargs
        keyword arguments passed to the `Checkpoint` handler

    Returns
    -------
    best_model_handler, es_handler, timer_handler
    """

    best_model_handler, es_handler, timer_handler = None, None, None

    # https://pytorch.org/ignite/contrib/engines.html#ignite.contrib.engines.common.setup_common_training_handlers
    # Extra kwargs (e.g. score_name, score_function) can be passed through so
    # the checkpoints saved here are scored by training stats.
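    # e.g. (hypothetical):
    #   score_name="train_loss",
    #   score_function=lambda engine: -engine.state.metrics["loss"]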
    common.setup_common_training_handlers(
        trainer=trainer,
        train_sampler=train_sampler,
        to_save=to_save,
        lr_scheduler=lr_scheduler,
        output_names=output_names,
        output_path=config.output_dir / 'checkpoints',
        save_every_iters=config.save_every_iters,
        n_saved=config.n_saved,
        log_every_iters=config.log_every_iters,
        with_pbars=config.with_pbars,
        with_pbar_on_iters=config.with_pbar_on_iters,
        stop_on_nan=config.stop_on_nan,
        clear_cuda_cache=config.clear_cuda_cache,
        with_gpu_stats=config.with_gpu_stats,
        **kwargs,
    )
    {% if save_best_model_by_val_score %}

    # https://pytorch.org/ignite/contrib/engines.html#ignite.contrib.engines.common.save_best_model_by_val_score
    best_model_handler = common.save_best_model_by_val_score(
        output_path=config.output_dir / 'checkpoints',
        evaluator=evaluator,
        model=model,
        metric_name=metric_name,
        n_saved=config.n_saved,
        trainer=trainer,
        tag='eval',
    )
    {% endif %}
    {% if add_early_stopping_by_val_score %}

    # https://pytorch.org/ignite/contrib/engines.html#ignite.contrib.engines.common.add_early_stopping_by_val_score
    es_handler = common.add_early_stopping_by_val_score(
        patience=config.patience,
        evaluator=evaluator,
        trainer=trainer,
        metric_name=es_metric_name,
    )
    {% endif %}
    {% if setup_timer %}

    # https://pytorch.org/ignite/handlers.html#ignite.handlers.Timer
    # Measure the average time to process a single batch of samples.
    # ITERATION_STARTED / ITERATION_COMPLETED are used here; swap in
    # whichever events you want to measure instead.
    timer_handler = Timer(average=True)
    timer_handler.attach(
        engine=trainer,
        start=Events.EPOCH_STARTED,
        resume=Events.ITERATION_STARTED,
        pause=Events.ITERATION_COMPLETED,
        step=Events.ITERATION_COMPLETED,
    )
    {% endif %}
    {% if setup_timelimit %}

    # Training will terminate if the training time exceeds `limit_sec`.
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, TimeLimit(limit_sec=config.limit_sec)
    )
    {% endif %}
    return best_model_handler, es_handler, timer_handler
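
A minimal usage sketch for the `get_handlers` template above. The
`SimpleNamespace` config, the `model`/`trainer`/`evaluator` objects, and the
metric names are illustrative assumptions, not part of the template; which of
the returned handlers are non-None depends on the rendered `{% if %}` flags.

from types import SimpleNamespace
from pathlib import Path

# Hypothetical config; the field names mirror the docstring above.
config = SimpleNamespace(
    output_dir=Path("/tmp/output"),
    save_every_iters=1000,
    n_saved=2,
    log_every_iters=10,
    with_pbars=True,
    with_pbar_on_iters=True,
    stop_on_nan=True,
    clear_cuda_cache=True,
    with_gpu_stats=False,
    patience=3,
    limit_sec=3600,
)

# `model`, `trainer`, and `evaluator` are assumed to exist already.
best_model_handler, es_handler, timer_handler = get_handlers(
    config=config,
    model=model,
    trainer=trainer,
    evaluator=evaluator,
    metric_name="accuracy",
    es_metric_name="accuracy",
)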