Beispiel #1
0
    def _test(with_eval, with_optim):
        trainer = Engine(lambda e, b: b)
        evaluators = None
        optimizers = None

        if with_eval:
            evaluator = Engine(lambda e, b: None)
            acc_scores = [0.1, 0.2, 0.3, 0.4, 0.3, 0.3, 0.2, 0.1, 0.1, 0.0]

            @trainer.on(Events.EPOCH_COMPLETED)
            def validate(engine):
                evaluator.run(
                    [0,]
                )

            @evaluator.on(Events.EPOCH_COMPLETED)
            def set_eval_metric(engine):
                engine.state.metrics = {"acc": acc_scores[trainer.state.epoch - 1]}

            evaluators = {"validation": evaluator}

        if with_optim:
            t = torch.tensor([0,])
            optimizers = {"optimizer": torch.optim.SGD([t,], lr=0.01)}

        setup_tb_logging(dirname, trainer, optimizers=optimizers, evaluators=evaluators, log_every_iters=1)

        handlers = trainer._event_handlers[Events.ITERATION_COMPLETED]
        for cls in [
            tb_logger_module.OutputHandler,
        ]:
            assert any([isinstance(h[0], cls) for h in handlers]), "{}".format(handlers)

        if with_optim:
            handlers = trainer._event_handlers[Events.ITERATION_STARTED]
            for cls in [
                tb_logger_module.OptimizerParamsHandler,
            ]:
                assert any([isinstance(h[0], cls) for h in handlers]), "{}".format(handlers)

        if with_eval:
            handlers = evaluator._event_handlers[Events.COMPLETED]
            for cls in [
                tb_logger_module.OutputHandler,
            ]:
                assert any([isinstance(h[0], cls) for h in handlers]), "{}".format(handlers)

        data = [0, 1, 2]
        trainer.run(data, max_epochs=10)

        tb_files = list(os.listdir(dirname))
        assert len(tb_files) == 1
        for v in [
            "events",
        ]:
            assert any([v in c for c in tb_files]), "{}".format(tb_files)
Beispiel #2
0
def training(local_rank, config):

    # Setup dataflow and
    train_loader, val_loader = get_dataflow(config)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Setup model trainer and evaluator
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler, config)
    evaluator = create_supervised_evaluator(model, metrics={"accuracy": Accuracy()}, device=idist.device())

    # Run model evaluation every 3 epochs and show results
    @trainer.on(Events.EPOCH_COMPLETED(every=3))
    def evaluate_model():
        state = evaluator.run(val_loader)
        if idist.get_rank() == 0:
            print(state.metrics)

    # Setup tensorboard experiment tracking
    if idist.get_rank() == 0:
        tb_logger = common.setup_tb_logging(
            config.get("output_path", "output"), trainer, optimizer, evaluators={"validation": evaluator},
        )

    trainer.run(train_loader, max_epochs=config.get("max_epochs", 3))

    if idist.get_rank() == 0:
        tb_logger.close()
Beispiel #3
0
def training(config,
             local_rank=None,
             with_mlflow_logging=False,
             with_plx_logging=False):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
                                      "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=getattr(
                                          config, "fp16_opt_level", "O2"),
                                      num_losses=1)
    model = DDP(model, delay_allreduce=True)
    criterion = config.criterion.to(device)

    prepare_batch = getattr(config, "prepare_batch", _prepare_batch)
    non_blocking = getattr(config, "non_blocking", True)

    # Setup trainer
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    def train_update_function(engine, batch):

        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y) / accumulation_steps

        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return {
            'supervised batch loss': loss.item(),
        }

    trainer = Engine(train_update_function)
    common.setup_common_distrib_training_handlers(
        trainer,
        train_sampler,
        to_save={
            'model': model,
            'optimizer': optimizer
        },
        save_every_iters=1000,
        output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler,
        with_gpu_stats=True,
        output_names=[
            'supervised batch loss',
        ],
        with_pbars=True,
        with_pbar_on_iters=with_mlflow_logging,
        log_every_iters=1)

    if getattr(config, "benchmark_dataflow", False):
        benchmark_dataflow_num_iters = getattr(config,
                                               "benchmark_dataflow_num_iters",
                                               1000)
        DataflowBenchmark(benchmark_dataflow_num_iters,
                          prepare_batch=prepare_batch,
                          device=device).attach(trainer, train_loader)

    # Setup evaluators
    val_metrics = {
        "Accuracy": Accuracy(device=device),
        "Top-5 Accuracy": TopKCategoricalAccuracy(k=5, device=device),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    evaluator_args = dict(model=model,
                          metrics=val_metrics,
                          device=device,
                          non_blocking=non_blocking,
                          prepare_batch=prepare_batch,
                          output_transform=lambda x, y, y_pred: (
                              model_output_transform(y_pred),
                              y,
                          ))
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_mlflow_logging:
        ProgressBar(persist=False,
                    desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(_):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)),
        run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "Accuracy"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    if dist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(config.output_path.as_posix(),
                                            trainer,
                                            optimizer,
                                            evaluators={
                                                "training": train_evaluator,
                                                "validation": evaluator
                                            })
        if with_mlflow_logging:
            common.setup_mlflow_logging(trainer,
                                        optimizer,
                                        evaluators={
                                            "training": train_evaluator,
                                            "validation": evaluator
                                        })

        if with_plx_logging:
            common.setup_plx_logging(trainer,
                                     optimizer,
                                     evaluators={
                                         "training": train_evaluator,
                                         "validation": evaluator
                                     })

        common.save_best_model_by_val_score(config.output_path.as_posix(),
                                            evaluator,
                                            model,
                                            metric_name=score_metric_name,
                                            trainer=trainer)

        # Log train/val predictions:
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2))

        tb_logger.attach(train_evaluator,
                         log_handler=predictions_gt_images_handler(
                             img_denormalize_fn=config.img_denormalize,
                             n_images=15,
                             another_engine=trainer,
                             prefix_tag="training"),
                         event_name=Events.ITERATION_COMPLETED(
                             once=len(train_eval_loader) // 2))

    trainer.run(train_loader, max_epochs=config.num_epochs)
Beispiel #4
0
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they wont have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_evaluator(model, metrics=metrics, config=config)
    train_evaluator = create_evaluator(model, metrics=metrics, config=config)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)

    # Store 2 best models by validation accuracy starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        get_save_handler(config),
        filename_prefix="best",
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    evaluator.add_event_handler(
        Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler
    )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        logger.exception("")
        raise e

    if rank == 0:
        tb_logger.close()
Beispiel #5
0
def training(local_rank, config, logger, with_clearml):

    rank = idist.get_rank()
    manual_seed(config.seed + local_rank)

    train_loader = config.train_loader
    val_loader = config.val_loader
    train_eval_loader = config.train_eval_loader

    model, optimizer, criterion = utils.initialize(config)

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger, with_clearml)

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if ("val_metrics" in config) and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="val")
    train_evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="train")

    val_interval = config.get("val_interval", 1)

    # Run validation on every val_interval epoch, in the end of the training
    # and in the begining if config.start_by_validation is True
    event = Events.EPOCH_COMPLETED(every=val_interval)
    if config.num_epochs % val_interval != 0:
        event |= Events.COMPLETED
    if config.get("start_by_validation", False):
        event |= Events.STARTED

    @trainer.on(event)
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        utils.log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(val_loader)
        utils.log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    score_metric_name = "mIoU_bg"
    if "es_patience" in config:
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    # Store 2 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=utils.get_save_handler(config.output_path.as_posix(), with_clearml),
        evaluator=evaluator,
        models=model,
        metric_name=score_metric_name,
        n_saved=2,
        trainer=trainer,
        tag="val",
    )

    # Setup Tensorboard logger
    if rank == 0:
        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={"training": train_evaluator, "validation": evaluator},
        )

        # Log validation predictions as images
        # We define a custom event filter to log less frequently the images (to reduce storage size)
        # - we plot images with masks of the middle validation batch
        # - once every 3 validations and
        # - at the end of the training
        def custom_event_filter(_, val_iteration):
            c1 = val_iteration == len(val_loader) // 2
            c2 = trainer.state.epoch % (config.get("val_interval", 1) * 3) == 0
            c2 |= trainer.state.epoch == config.num_epochs
            return c1 and c2

        # Image denormalization function to plot predictions with images
        mean = config.get("mean", (0.485, 0.456, 0.406))
        std = config.get("std", (0.229, 0.224, 0.225))
        img_denormalize = partial(data.denormalize, mean=mean, std=std)

        tb_logger.attach(
            evaluator,
            log_handler=vis.predictions_gt_images_handler(
                img_denormalize_fn=img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation",
            ),
            event_name=Events.ITERATION_COMPLETED(event_filter=custom_event_filter),
        )

    # Log confusion matrix to ClearML:
    if with_clearml:
        trainer.add_event_handler(Events.COMPLETED, compute_and_log_cm, cm_metric, trainer.state.iteration)

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if idist.get_rank() == 0:
        tb_logger.close()
Beispiel #6
0
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            task.connect(config)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    logger.info(
        f"# model parameters (M): {sum([m.numel() for m in model.parameters()]) * 1e-6}"
    )

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and
    # compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they wont have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_evaluator(model, metrics=metrics, config=config)
    train_evaluator = create_evaluator(model, metrics=metrics, config=config)

    if config["smoke_test"]:
        logger.info(
            "Reduce the size of training and test dataloader as smoke_test=True"
        )

        def get_batches(loader):
            loader_iter = iter(loader)
            return [next(loader_iter) for _ in range(5)]

        train_loader = get_batches(train_loader)
        test_loader = get_batches(test_loader)

    if config["with_pbar"] and rank == 0:
        ProgressBar(desc="Evaluation (train)",
                    persist=False).attach(train_evaluator)
        ProgressBar(desc="Evaluation (val)", persist=False).attach(evaluator)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED,
        run_validation,
    )

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store 1 best models by validation accuracy starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        get_save_handler(config),
        filename_prefix="best",
        n_saved=1,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    evaluator.add_event_handler(
        Events.COMPLETED(
            lambda *_: trainer.state.epoch > config["num_epochs"] // 2),
        best_model_handler,
    )

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        logger.exception("")
        raise e

    if rank == 0:
        tb_logger.close()
Beispiel #7
0
def main():
  parser = argparse.ArgumentParser()

  # Required parameters
  parser.add_argument("--model", type=str, default='ffn', help="model's name")
  parser.add_argument("--mode", type=int, choices=[0, 1, 2], default=None)
  parser.add_argument("--SNRdb", type=float, default=None)
  parser.add_argument("--pilot_version", type=int, choices=[1, 2], default=1)
  parser.add_argument("--loss_type", type=str, default="BCELoss")
  parser.add_argument("--train_batch_size", type=int, default=128)
  parser.add_argument("--valid_batch_size", type=int, default=128)
  parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
  parser.add_argument("--max_norm", type=float, default=-1)
  parser.add_argument("--lr", type=float, default=1e-3)
  parser.add_argument("--noise_lambda", type=float, default=1.0)
  parser.add_argument("--lr_scheduler", type=str, choices=["linear", "cycle", "cosine"], default="linear")
  parser.add_argument("--reset_lr_scheduler", type=str, choices=["linear", "cycle", "cosine"], default=None)
  parser.add_argument("--reset_trainer", action='store_true')
  parser.add_argument("--modify_model", action='store_true')
  parser.add_argument("--wd", type=float, default=1e-4, help="weight decay")
  parser.add_argument("--eval_iter", type=int, default=10)
  parser.add_argument("--save_iter", type=int, default=10)
  parser.add_argument("--n_epochs", type=int, default=10)
  parser.add_argument("--flush_dataset", type=int, default=0)
  parser.add_argument("--no_cache", action='store_true')
  parser.add_argument("--with_pure_y", action='store_true') 
  parser.add_argument("--with_h", action='store_true') 
  parser.add_argument("--only_l1", action='store_true', help="Only loss 1")
  parser.add_argument("--interpolation", action='store_true', help="if interpolate between pure and reconstruction.") 
  parser.add_argument("--data_dir", type=str, default="data")
  parser.add_argument("--cache_dir", type=str, default="train_cache")
  parser.add_argument("--output_path", type=str, default="runs", help="model save")
  parser.add_argument("--resume_from", type=str, default=None, help="resume training.")
  parser.add_argument("--first_cache_index", type=int, default=0)
  parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                      help="Device (cuda or cpu)")
  parser.add_argument("--local_rank", type=int, default=-1,
                      help="Local rank for distributed training (-1: not distributed)")
  parser.add_argument("--seed", type=int, default=43)
  parser.add_argument("--debug", action='store_true')
  args = parser.parse_args()

  args.output_path = os.path.join(args.output_path, f'pilot_{args.pilot_version}')
  args.cache_dir = os.path.join(args.data_dir, args.cache_dir)
  # Setup CUDA, GPU & distributed training
  args.distributed = (args.local_rank != -1)
  if not args.distributed:
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend="nccl", init_method='env://')
  args.n_gpu = torch.cuda.device_count() if not args.distributed else 1
  args.device = device

  # Set seed
  set_seed(args)
  logger = setup_logger("trainer", distributed_rank=args.local_rank)

  # Model construction
  model = getattr(models, args.model)(args)
  model = model.to(device)
  optimizer = AdamW(model.parameters(), lr = args.lr, weight_decay=args.wd)

  if args.loss_type == "MSELoss":
    criterion = nn.MSELoss(reduction='sum').to(device)
  else:
    criterion = getattr(nn, args.loss_type, getattr(auxiliary, args.loss_type, None))().to(device)
  criterion2 = nn.MSELoss(reduction='sum').to(device)

  if args.local_rank != -1:
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
    )

  train_dataset = SIGDataset(args, data_type="train")
  valid_dataset = SIGDataset(args, data_type="valid")
  train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
  valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None
  train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, pin_memory=True, shuffle=(not args.distributed))
  valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, pin_memory=True, shuffle=False)
  
  lr_scheduler = None
  if args.lr_scheduler == "linear":
    lr_scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
  elif args.lr_scheduler == "cycle":
    lr_scheduler = LinearCyclicalScheduler(optimizer, 'lr', 0.0, args.lr, args.eval_iter * len(train_loader))
  elif args.lr_scheduler == "cosine":
    lr_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, args.eval_iter * len(train_loader))

  # Training function and trainer
  def update(engine, batch):
      model.train()
      y, x_label, y_pure, H = train_dataset.prepare_batch(batch, device=args.device)

      if args.with_pure_y and args.with_h:
        x_pred, y_pure_pred, H_pred = model(y, pure=y_pure, H=H, opp=True)
        loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps
        if args.loss_type == "MSELoss":
          loss_1 = loss_1 / x_pred.size(0)
        loss_noise = criterion2(y_pure_pred, y_pure) / y.size(0) / args.gradient_accumulation_steps
        loss_noise_h = criterion2(H_pred, H) / H.size(0) / args.gradient_accumulation_steps
        if args.only_l1:
          loss = loss_1
        else:
          loss = loss_1 + loss_noise * args.noise_lambda + loss_noise_h
        output = (loss.item(), loss_1.item(), loss_noise.item(), loss_noise_h.item())
      elif args.with_pure_y:
        x_pred, y_pure_pred = model(y, pure=y_pure if args.interpolation else None, opp=True)
        loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps
        loss_noise = criterion2(y_pure_pred, y_pure) / y.size(0) / args.gradient_accumulation_steps
        loss = loss_1 + loss_noise * args.noise_lambda
        output = (loss.item(), loss_1.item(), loss_noise.item())
      elif args.with_h:
        x_pred, H_pred = model(y, opp=True)
        loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps
        loss_noise = criterion2(H_pred, H) / H.size(0) / args.gradient_accumulation_steps
        loss = loss_1 + loss_noise * args.noise_lambda
        output = (loss.item(), loss_1.item(), loss_noise.item())
      else:
        x_pred = model(y)
        loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps
        loss = loss_1
        output = (loss.item(), loss_1.item(), torch.zeros_like(loss_1).item())

      loss.backward()
      if args.max_norm > 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
      if engine.state.iteration % args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
      return output
  trainer = Engine(update)

  to_save = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler}
  metric_names = ["loss", "l1", "ln"]
  if args.with_pure_y and args.with_h:
    metric_names.append("lnH")

  common.setup_common_training_handlers(
    trainer=trainer,
    train_sampler=train_loader.sampler,
    to_save=to_save,
    save_every_iters=len(train_loader) * args.save_iter,
    lr_scheduler=lr_scheduler,
    output_names=metric_names,
    with_pbars=False,
    clear_cuda_cache=False,
    output_path=args.output_path,
    n_saved=2,
  )

  resume_from = args.resume_from
  if resume_from is not None:
    checkpoint_fp = Path(resume_from)
    assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(checkpoint_fp.as_posix())
    logger.info("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix()))
    checkpoint = torch.load(checkpoint_fp.as_posix(), map_location="cpu")
    if args.reset_trainer:
      to_save.pop("trainer")
    checkpoint_to_load = to_save if 'validation' not in resume_from else {"model": model}
    Checkpoint.load_objects(to_load=checkpoint_to_load, checkpoint=checkpoint)
    if args.reset_lr_scheduler is not None:
      if args.reset_lr_scheduler == "linear":
        lr_scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
      elif args.reset_lr_scheduler == "cycle":
        lr_scheduler = LinearCyclicalScheduler(optimizer, 'lr', 0.0, args.lr, args.eval_iter * len(train_loader))
      elif args.reset_lr_scheduler == "cosine":
        lr_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, args.eval_iter * len(train_loader))

  metrics = {
    "accuracy": Accuracy(lambda output: (torch.round(output[0][0]), output[1][0])), 
    "loss_1": Loss(criterion, output_transform=lambda output: (output[0][0], output[1][0])),
    "loss_noise": Loss(criterion2, output_transform=lambda output: (output[0][1], output[1][1]))
  }
  if args.with_pure_y and args.with_h:
    metrics["loss_noise_h"] = Loss(criterion2, output_transform=lambda output: (output[0][2], output[1][2]))

  def _inference(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[torch.Tensor]]:
    model.eval()
    with torch.no_grad():
      x, y, x_pure, H = valid_dataset.prepare_batch(batch, device=args.device, non_blocking=True)
      if args.with_pure_y and args.with_h:
        y_pred, x_pure_pred, h_pred = model(x, opp=True)
        outputs = (y_pred, x_pure_pred, h_pred), (y, x_pure, H)
      elif args.with_pure_y:
        y_pred, x_pure_pred = model(x, opp=True)
        outputs = (y_pred, x_pure_pred), (y, x_pure)
      elif args.with_h:
        y_pred, h_pred = model(x, opp=True)
        outputs = (y_pred, h_pred), (y, H)
      else:
        y_pred = model(x)
        x_pure_pred = x_pure
        outputs = (y_pred, x_pure_pred), (y, x_pure)       
      return outputs
  evaluator = Engine(_inference)
  for name, metric in metrics.items():
      metric.attach(evaluator, name)

  trainer.add_event_handler(Events.EPOCH_COMPLETED(every=args.eval_iter), lambda _: evaluator.run(valid_loader))

  if args.flush_dataset > 0:
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=args.n_epochs//args.flush_dataset), 
                  lambda _: train_loader.dataset.reset() if args.no_cache else train_loader.dataset.reload())

  # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
  if args.local_rank in [-1, 0]:
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=metric_names, output_transform=lambda _: {"lr": f"{optimizer.param_groups[0]['lr']:.2e}"})
    evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

    tb_logger = common.setup_tb_logging(args.output_path, trainer, optimizer, evaluators={'validation': evaluator}, log_every_iters=1)

  # Store 3 best models by validation accuracy:
  common.gen_save_best_models_by_val_score(
    save_handler=DiskSaver(args.output_path, require_empty=False),
    evaluator=evaluator,
    models={"model": model},
    metric_name="accuracy",
    n_saved=3,
    trainer=trainer,
    tag="validation"
  )

  # Run the training
  trainer.run(train_loader, max_epochs=args.n_epochs)

  if args.local_rank in [-1, 0]:
    tb_logger.close()
def training(local_rank, config, logger=None):
    #
    # if not getattr(config, "use_fp16", True):
    #     raise RuntimeError("This training script uses by default fp16 AMP")

    torch.backends.cudnn.benchmark = True

    set_seed(config.seed + local_rank)

    train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader

    # Setup model, optimizer, criterion
    model, optimizer, criterion = initialize(config)

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler,
                             config, logger)

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator, train_evaluator = create_evaluators(model, val_metrics, config)

    val_interval = getattr(config, "val_interval", 1)

    @trainer.on(Events.EPOCH_COMPLETED(every=val_interval))
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(val_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    if config.num_epochs % val_interval != 0:
        trainer.add_event_handler(Events.COMPLETED, run_validation)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models=model,
        metric_name=score_metric_name,
        n_saved=3,
        trainer=trainer,
        tag="val",
    )

    if idist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={
                "training": train_evaluator,
                "validation": evaluator
            },
        )

        exp_tracking_logger = tracking.setup_logging(trainer,
                                                     optimizer,
                                                     evaluators={
                                                         "training":
                                                         train_evaluator,
                                                         "validation":
                                                         evaluator
                                                     })

        # Log validation predictions as images
        # We define a custom event filter to log less frequently the images (to reduce storage size)
        # - we plot images with masks of the middle validation batch
        # - once every 3 validations and
        # - at the end of the training
        def custom_event_filter(_, val_iteration):
            c1 = val_iteration == len(val_loader) // 2
            c2 = trainer.state.epoch % (getattr(config, "val_interval", 1) *
                                        3) == 0
            c2 |= trainer.state.epoch == config.num_epochs
            return c1 and c2

        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(
                event_filter=custom_event_filter),
        )

    # Log confusion matrix to Trains:

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if idist.get_rank() == 0:
        tb_logger.close()
        exp_tracking_logger.close()
Beispiel #9
0
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-QAT-Training",
                          distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")

        folder_name = "{}_backend-{}-{}_{}".format(config["model"],
                                                   idist.backend(),
                                                   idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they wont have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store 3 best models by validation accuracy:
    common.save_best_model_by_val_score(
        output_path=config["output_path"],
        evaluator=evaluator,
        model=model,
        metric_name="Accuracy",
        n_saved=1,
        trainer=trainer,
        tag="test",
    )

    trainer.run(train_loader, max_epochs=config["num_epochs"])

    if rank == 0:
        tb_logger.close()
Beispiel #10
0
def training(config, local_rank, with_pbar_on_iters=True):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
                                      "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=getattr(
                                          config, "fp16_opt_level", "O2"),
                                      num_losses=1)
    model = DDP(model, delay_allreduce=True)

    criterion = config.criterion.to(device)

    # Setup trainer
    prepare_batch = getattr(config, "prepare_batch")
    non_blocking = getattr(config, "non_blocking", True)
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    def train_update_function(engine, batch):
        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y)

        if isinstance(loss, Mapping):
            assert 'supervised batch loss' in loss
            loss_dict = loss
            output = {k: v.item() for k, v in loss_dict.items()}
            loss = loss_dict['supervised batch loss'] / accumulation_steps
        else:
            output = {'supervised batch loss': loss.item()}

        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return output

    output_names = getattr(config, "output_names", [
        'supervised batch loss',
    ])

    trainer = Engine(train_update_function)
    common.setup_common_distrib_training_handlers(
        trainer,
        train_sampler,
        to_save={
            'model': model,
            'optimizer': optimizer
        },
        save_every_iters=1000,
        output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler,
        output_names=output_names,
        with_pbars=True,
        with_pbar_on_iters=with_pbar_on_iters,
        log_every_iters=1)

    def output_transform(output):
        return output['y_pred'], output['y']

    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes,
                                output_transform=output_transform)
    pr = cmPrecision(cm_metric, average=False)
    re = cmRecall(cm_metric, average=False)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
        "Accuracy": cmAccuracy(cm_metric),
        "Precision": pr,
        "Recall": re,
        "F1": Fbeta(beta=1.0, output_transform=output_transform)
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator_args = dict(model=model,
                          metrics=val_metrics,
                          device=device,
                          non_blocking=non_blocking,
                          prepare_batch=prepare_batch,
                          output_transform=lambda x, y, y_pred: {
                              'y_pred': model_output_transform(y_pred),
                              'y': y
                          })
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_pbar_on_iters:
        ProgressBar(persist=False,
                    desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(engine):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)),
        run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    if dist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(config.output_path.as_posix(),
                                            trainer,
                                            optimizer,
                                            evaluators={
                                                "training": train_evaluator,
                                                "validation": evaluator
                                            })
        common.setup_mlflow_logging(trainer,
                                    optimizer,
                                    evaluators={
                                        "training": train_evaluator,
                                        "validation": evaluator
                                    })

        common.save_best_model_by_val_score(config.output_path.as_posix(),
                                            evaluator,
                                            model,
                                            metric_name=score_metric_name,
                                            trainer=trainer)

        # Log train/val predictions:
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2))

        log_train_predictions = getattr(config, "log_train_predictions", False)
        if log_train_predictions:
            tb_logger.attach(train_evaluator,
                             log_handler=predictions_gt_images_handler(
                                 img_denormalize_fn=config.img_denormalize,
                                 n_images=15,
                                 another_engine=trainer,
                                 prefix_tag="validation"),
                             event_name=Events.ITERATION_COMPLETED(
                                 once=len(train_eval_loader) // 2))

    trainer.run(train_loader, max_epochs=config.num_epochs)
Beispiel #11
0
def training(config, local_rank, with_pbar_on_iters=True):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
                                      "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    unsup_train_loader = config.unsup_train_loader
    unsup_train_sampler = getattr(unsup_train_loader, "sampler", None)
    assert unsup_train_sampler is not None, "Train loader of type '{}' " \
                                      "should have attribute 'sampler'".format(type(unsup_train_loader))
    assert hasattr(unsup_train_sampler, 'set_epoch') and callable(unsup_train_sampler.set_epoch), \
        "Unsupervised train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(model, optimizer, opt_level=getattr(config, "fp16_opt_level", "O2"), num_losses=2)
    model = DDP(model, delay_allreduce=True)
    
    criterion = config.criterion.to(device)
    unsup_criterion = config.unsup_criterion.to(device)
    unsup_batch_num_repetitions = getattr(config, "unsup_batch_num_repetitions", 1)

    # Setup trainer
    prepare_batch = getattr(config, "prepare_batch")
    non_blocking = getattr(config, "non_blocking", True)
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform", lambda x: x)

    def cycle(seq):
        while True:
            for i in seq:
                yield i

    unsup_train_loader_iter = cycle(unsup_train_loader)

    def supervised_loss(batch):
        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y)
        return loss

    def unsupervised_loss(x):

        with torch.no_grad():
            y_pred_orig = model(x)

            # Data augmentation: geom only
            k = random.randint(1, 3)
            x_aug = torch.rot90(x, k=k, dims=(2, 3))
            y_pred_orig_aug = torch.rot90(y_pred_orig, k=k, dims=(2, 3))
            if random.random() < 0.5:
                x_aug = torch.flip(x_aug, dims=(2, ))
                y_pred_orig_aug = torch.flip(y_pred_orig_aug, dims=(2, )) 
            if random.random() < 0.5:
                x_aug = torch.flip(x_aug, dims=(3, ))
                y_pred_orig_aug = torch.flip(y_pred_orig_aug, dims=(3, )) 

            y_pred_orig_aug = y_pred_orig_aug.argmax(dim=1).long()

        y_pred_aug = model(x_aug.detach())

        loss = unsup_criterion(y_pred_aug, y_pred_orig_aug.detach())

        return loss

    def train_update_function(engine, batch):
        model.train()

        loss = supervised_loss(batch)
        if isinstance(loss, Mapping):
            assert 'supervised batch loss' in loss
            loss_dict = loss
            output = {k: v.item() for k, v in loss_dict.items()}
            loss = loss_dict['supervised batch loss'] / accumulation_steps
        else:
            output = {'supervised batch loss': loss.item()}
        
        # Difference with original UDA
        # Apply separately grads from supervised/unsupervised parts
        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        unsup_batch = next(unsup_train_loader_iter)
        unsup_x = unsup_batch['image']
        unsup_x = convert_tensor(unsup_x, device=device, non_blocking=non_blocking)

        for _ in range(unsup_batch_num_repetitions):
            unsup_loss = engine.state.unsup_lambda * unsupervised_loss(unsup_x)

            assert isinstance(unsup_loss, torch.Tensor)
            output['unsupervised batch loss'] = unsup_loss.item()

            with amp.scale_loss(unsup_loss, optimizer, loss_id=1) as scaled_loss:
                scaled_loss.backward()

            if engine.state.iteration % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

        unsup_batch = None
        unsup_x = None

        total_loss = loss +  unsup_loss
        output['total batch loss'] = total_loss.item()

        return output

    output_names = getattr(config, "output_names", 
                           ['supervised batch loss', 'unsupervised batch loss', 'total batch loss'])

    trainer = Engine(train_update_function)

    @trainer.on(Events.STARTED)
    def init(engine):
        if hasattr(config, "unsup_lambda_min"):
            engine.state.unsup_lambda = config.unsup_lambda_min
        else:
            engine.state.unsup_lambda = getattr(config, "unsup_lambda", 0.001)

    @trainer.on(Events.ITERATION_COMPLETED)
    def update_unsup_params(engine):        
        engine.state.unsup_lambda += getattr(config, "unsup_lambda_delta", 0.00001)
        if hasattr(config, "unsup_lambda_max"):
            m = config.unsup_lambda_max
            engine.state.unsup_lambda = engine.state.unsup_lambda if engine.state.unsup_lambda < m else m

    common.setup_common_distrib_training_handlers(
        trainer, train_sampler,
        to_save={'model': model, 'optimizer': optimizer},
        save_every_iters=1000, output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler, output_names=output_names,
        with_pbars=True, with_pbar_on_iters=with_pbar_on_iters, log_every_iters=1
    )

    def output_transform(output):        
        return output['y_pred'], output['y']

    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes, output_transform=output_transform)
    pr = cmPrecision(cm_metric, average=False)
    re = cmRecall(cm_metric, average=False)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
        "Accuracy": cmAccuracy(cm_metric),
        "Precision": pr,
        "Recall": re,
        "F1": Fbeta(beta=1.0, output_transform=output_transform)
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator_args = dict(
        model=model, metrics=val_metrics, device=device, non_blocking=non_blocking, prepare_batch=prepare_batch,
        output_transform=lambda x, y, y_pred: {'y_pred': model_output_transform(y_pred), 'y': y}
    )
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_pbar_on_iters:
        ProgressBar(persist=False, desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(engine):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    if dist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(config.output_path.as_posix(), trainer, optimizer,
                                            evaluators={"training": train_evaluator, "validation": evaluator})
        common.setup_mlflow_logging(trainer, optimizer,
                                    evaluators={"training": train_evaluator, "validation": evaluator})

        common.save_best_model_by_val_score(config.output_path.as_posix(), evaluator, model,
                                            metric_name=score_metric_name, trainer=trainer)

        # Log unsup_lambda
        @trainer.on(Events.ITERATION_COMPLETED(every=100))
        def tblog_unsupervised_lambda(engine):
            tb_logger.writer.add_scalar("training/unsupervised lambda", engine.state.unsup_lambda, engine.state.iteration)
            mlflow.log_metric("training unsupervised lambda", engine.state.unsup_lambda, step=engine.state.iteration)

        # Log train/val predictions:
        tb_logger.attach(evaluator,
                         log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize,
                                                                   n_images=15,
                                                                   another_engine=trainer,
                                                                   prefix_tag="validation"),
                         event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2))

        log_train_predictions = getattr(config, "log_train_predictions", False)
        if log_train_predictions:
            tb_logger.attach(train_evaluator,
                             log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize,
                                                                       n_images=15,
                                                                       another_engine=trainer,
                                                                       prefix_tag="validation"),
                             event_name=Events.ITERATION_COMPLETED(once=len(train_eval_loader) // 2))

    trainer.run(train_loader, max_epochs=config.num_epochs)
def run_training(local_rank: int, config: ConfigSchema) -> Dict[str, float]:
    rank = idist.get_rank()
    if config.seed is not None:
        manual_seed(config.seed + rank)

    logger = setup_logger(name=config.experiment_name, distributed_rank=local_rank)

    log_basic_info(logger, config)

    if rank == 0:
        prepare_output_directory(config)
        logger.info("Output path: {}".format(config.output_path))

    weak_label_mgr = get_weak_label_manager(config)

    # Setup dataflow, model, optimizer, criterion
    data_loaders = get_dataflow(config, weak_label_mgr)
    train_loader = data_loaders["train"]
    config.num_iters_per_epoch = len(train_loader)

    model, optimizer, criterion = initialize(config, weak_label_mgr)

    metrics = get_metrics(criterion)
    trainer, evaluators = create_trainer_and_evaluators(
        model, optimizer, criterion, data_loaders, metrics, config, logger
    )

    if rank == 0:
        tb_logger = common.setup_tb_logging(
            config.output_path, trainer, optimizer, evaluators=evaluators
        )

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluators["val"],
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="test",
    )
    state_at_best_val = StateAtBestVal(
        score_function=lambda: evaluators["val"].state.metrics["accuracy"],
        state_function=lambda: dict(
            {"val_" + key: val for key, val in evaluators["val"].state.metrics.items()},
            **{
                "test_" + key: val
                for key, val in evaluators["test"].state.metrics.items()
            },
            epoch=trainer.state.epoch
        ),
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED, state_at_best_val)

    try:
        trainer.run(train_loader, max_epochs=config.num_epochs)
    except Exception:
        import traceback

        print(traceback.format_exc())
    else:
        assert state_at_best_val.best_state is not None
        tb_logger.writer.add_hparams(  # type: ignore
            get_hparams(config),
            {"hparam/" + key: val for key, val in state_at_best_val.best_state.items()},
        )
    finally:
        if rank == 0:
            tb_logger.close()  # type: ignore

    return evaluators["val"].state.metrics
Beispiel #13
0
def run(
    data_path="/tmp/MNIST",
    seed=3321,
    mode="xentropy",
    noise_fraction=0.35,
    batch_size=64,
    val_batch_size=1000,
    num_epochs=50,
    lr=0.01,
    momentum=0.5,
    as_pseudo_label=None,
    log_dir="/tmp/output-bootstraping-loss/mnist/",
    with_trains=False,
):
    """Training on noisy labels with bootstrapping

    Args:
        data_path (str): Path to MNIST dataset. Default, "/tmp/MNIST"
        seed (int): Random seed to setup. Default, 3321
        mode (str): Loss function mode: cross-entropy or bootstrapping (soft, hard). 
            Choices 'xentropy', 'soft_bootstrap', 'hard_bootstrap'.
        noise_fraction (float): Label noise fraction. Default, 0.35.
        batch_size (int): Input batch size for training. Default, 64.
        val_batch_size (int): input batch size for validation. Default, 1000.
        num_epochs (int): Number of epochs to train. Default, 50.
        lr (float): Learning rate. Default, 0.01.
        momentum (float): SGD momentum. Default, 0.5.
        log_dir (str): Log directory for Tensorboard log output. Default="/tmp/output-bootstraping-loss/mnist/".
        with_trains (bool): if True, experiment Trains logger is setup. Default, False.

    """
    assert torch.cuda.is_available(), "Training should running on GPU"
    device = "cuda"

    manual_seed(seed)
    logger = setup_logger(name="MNIST-Training")

    now = datetime.now().strftime("%Y%m%d-%H%M%S")

    # Setup output path
    suffix = ""
    if mode == "soft_bootstrap" and (as_pseudo_label is not None
                                     and not as_pseudo_label):
        suffix = "as_xreg"
    output_path = Path(log_dir) / "train_{}_{}_{}_{}__{}".format(
        mode, noise_fraction, suffix, now, num_epochs)

    if not output_path.exists():
        output_path.mkdir(parents=True)

    parameters = {
        "seed": seed,
        "mode": mode,
        "noise_fraction": noise_fraction,
        "batch_size": batch_size,
        "num_epochs": num_epochs,
        "lr": lr,
        "momentum": momentum,
        "as_pseudo_label": as_pseudo_label,
    }
    log_basic_info(logger, parameters)

    if with_trains:
        from trains import Task

        task = Task.init("BootstrappingLoss - Experiments on MNIST",
                         task_name=output_path.name)
        # Log hyper parameters
        task.connect(parameters)

    train_loader, test_loader = get_data_loaders(data_path, noise_fraction,
                                                 batch_size, val_batch_size)
    model = Net().to(device)
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)

    if mode == 'xentropy':
        criterion = nn.CrossEntropyLoss()
    elif mode == 'soft_bootstrap':
        if as_pseudo_label is None:
            as_pseudo_label = True
        criterion = SoftBootstrappingLoss(beta=0.95,
                                          as_pseudo_label=as_pseudo_label)
    elif mode == 'hard_bootstrap':
        criterion = HardBootstrappingLoss(beta=0.8)
    else:
        raise ValueError(
            "Wrong mode {}, expected: xentropy, soft_bootstrap or hard_bootstrap"
            .format(mode))

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device,
                                        non_blocking=True)

    metrics = {
        "Accuracy": Accuracy(),
        "{} loss".format(mode): Loss(criterion),
    }
    if mode is not "xentropy":
        metrics["xentropy loss"] = Loss(nn.CrossEntropyLoss())

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, "Test", state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED | Events.COMPLETED,
                              run_validation)

    evaluators = {"training": train_evaluator, "test": evaluator}
    tb_logger = common.setup_tb_logging(output_path,
                                        trainer,
                                        optimizer,
                                        evaluators=evaluators)

    trainer.run(train_loader, max_epochs=num_epochs)

    test_acc = evaluator.state.metrics["Accuracy"]
    tb_logger.writer.add_hparams(parameters,
                                 {"hparam/test_accuracy": test_acc})

    tb_logger.close()

    return (mode, noise_fraction, as_pseudo_label, test_acc)
Beispiel #14
0
def training(local_rank, config, logger=None):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    torch.backends.cudnn.benchmark = True

    set_seed(config.seed + local_rank)

    train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader

    # Setup model, optimizer, criterion
    model, optimizer, criterion = initialize(config)

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler,
                             config, logger)

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator, train_evaluator = create_evaluators(model, val_metrics, config)

    val_interval = getattr(config, "val_interval", 1)

    @trainer.on(Events.EPOCH_COMPLETED(every=val_interval))
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(val_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    if config.num_epochs % val_interval != 0:
        trainer.add_event_handler(Events.COMPLETED, run_validation)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models=model,
        metric_name=score_metric_name,
        n_saved=3,
        trainer=trainer,
        tag="val",
    )

    if idist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={
                "training": train_evaluator,
                "validation": evaluator
            },
        )

        if not exp_tracking.has_clearml:
            exp_tracking_logger = exp_tracking.setup_logging(
                trainer,
                optimizer,
                evaluators={
                    "training": train_evaluator,
                    "validation": evaluator
                })

        # Log validation predictions as images
        # We define a custom event filter to log less frequently the images (to reduce storage size)
        # - we plot images with masks of the middle validation batch
        # - once every 3 validations and
        # - at the end of the training
        def custom_event_filter(_, val_iteration):
            c1 = val_iteration == len(val_loader) // 2
            c2 = trainer.state.epoch % (getattr(config, "val_interval", 1) *
                                        3) == 0
            c2 |= trainer.state.epoch == config.num_epochs
            return c1 and c2

        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(
                event_filter=custom_event_filter),
        )

    # Log confusion matrix to ClearML:
    if exp_tracking.has_clearml:

        @trainer.on(Events.COMPLETED)
        def compute_and_log_cm():
            cm = cm_metric.compute()
            # CM: values are normalized such that diagonal values represent class recalls
            cm = ConfusionMatrix.normalize(cm, "recall").cpu().numpy()

            if idist.get_rank() == 0:
                try:
                    from clearml import Task
                except ImportError:
                    # Backwards-compatibility for legacy Trains SDK
                    from trains import Task

                clearml_logger = Task.current_task().get_logger()
                clearml_logger.report_confusion_matrix(
                    title="Final Confusion Matrix",
                    series="cm-preds-gt",
                    matrix=cm,
                    iteration=trainer.state.iteration,
                    xlabels=VOCSegmentationOpencv.target_names,
                    ylabels=VOCSegmentationOpencv.target_names,
                )

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if idist.get_rank() == 0:
        tb_logger.close()
        if not exp_tracking.has_clearml:
            exp_tracking_logger.close()
Beispiel #15
0
def training(local_rank, config, logger=None):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    torch.backends.cudnn.benchmark = True

    set_seed(config.seed + local_rank)

    train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader

    # Setup model, optimizer, criterion
    model, optimizer, criterion = initialize(config)

    if not hasattr(config, "prepare_batch"):
        config.prepare_batch = _prepare_batch

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler,
                             config, logger)

    if getattr(config, "benchmark_dataflow", False):
        benchmark_dataflow_num_iters = getattr(config,
                                               "benchmark_dataflow_num_iters",
                                               1000)
        DataflowBenchmark(benchmark_dataflow_num_iters,
                          prepare_batch=config.prepare_batch).attach(
                              trainer, train_loader)

    # Setup evaluators
    val_metrics = {
        "Accuracy": Accuracy(),
        "Top-5 Accuracy": TopKCategoricalAccuracy(k=5),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator, train_evaluator = create_evaluators(model, val_metrics, config)

    @trainer.on(
        Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1))
        | Events.COMPLETED)
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(val_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)

    score_metric_name = "Accuracy"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    # Store 3 best models by validation accuracy:
    common.save_best_model_by_val_score(
        config.output_path.as_posix(),
        evaluator,
        model=model,
        metric_name=score_metric_name,
        n_saved=3,
        trainer=trainer,
        tag="val",
    )

    if idist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={
                "training": train_evaluator,
                "validation": evaluator
            },
        )

        exp_tracking_logger = exp_tracking.setup_logging(trainer,
                                                         optimizer,
                                                         evaluators={
                                                             "training":
                                                             train_evaluator,
                                                             "validation":
                                                             evaluator
                                                         })

        # Log train/val predictions:
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2),
        )

        tb_logger.attach(
            train_evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="training"),
            event_name=Events.ITERATION_COMPLETED(
                once=len(train_eval_loader) // 2),
        )

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if idist.get_rank() == 0:
        tb_logger.close()
        exp_tracking_logger.close()
def training(local_rank, config):

    config["device"] = "cuda" if config["active_gpu_ids"] else "cpu"

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)

    device = idist.device()

    logger = setup_logger(name="Carbon Black Semantic Segmentation Training",
                          distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]

    if rank == 0:
        if config["stop_iteration"] is None:
            now = utils.get_time_stamp()
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = (
            f"{config['architecture']}-{config['encoder']}-{config['encoder_weights']}_"
            f"backend-{idist.backend()}-{idist.get_world_size()}_{now}")

        output_path = Path(output_path) / folder_name
        output_path.mkdir(parents=True, exist_ok=True)
        config["output_path"] = output_path.as_posix()
        config["task_name"] = output_path.stem

        logger.info(f"Output path: {output_path}")

        if "cuda" in idist.device().type:
            config["cuda_device_name"] = torch.cuda.get_device_name(local_rank)

        setup_trains_logging(config)

    dataloader_train, dataloader_val = get_dataloaders(config)

    config["num_iterations_per_epoch"] = len(dataloader_train)
    config["num_epochs"] = round(config["num_iterations"] /
                                 config["num_iterations_per_epoch"])
    model = modeling.get_model(config)

    optimizer = get_optimizer(model, config)
    loss = get_loss()

    lr_scheduler = get_lr_scheduler(optimizer, config)

    trainer = create_trainer(model, optimizer, loss, lr_scheduler,
                             dataloader_train.sampler, config, logger)

    metrics = get_metrics(loss)

    # We define two evaluators as they wont have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    evaluator_train = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": evaluator_train, "validation": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

        example_prediction_logger = ExamplePredictionLogger(
            tb_logger, model, device)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = evaluator_train.run(dataloader_train)
        data_subset = "Train"
        log_metrics(logger, epoch, state.times["COMPLETED"], data_subset,
                    state.metrics)
        log_confusion_matrix(tb_logger, epoch, data_subset, state.metrics)

        state = evaluator.run(dataloader_val)
        data_subset = "Val"
        log_metrics(logger, epoch, state.times["COMPLETED"], data_subset,
                    state.metrics)
        log_confusion_matrix(tb_logger, epoch, data_subset, state.metrics)
        example_prediction_logger.log_visualization(dataloader_val.dataset,
                                                    epoch)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="validation",
    )

    # TODO: Add early stopping

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(
                f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    # noinspection PyBroadException
    try:
        trainer.run(dataloader_train, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        # noinspection PyUnboundLocalVariable
        tb_logger.close()
Beispiel #17
0
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="ImageNet-Training",
                          distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = "stop-on-{}".format(config["stop_iteration"])

        folder_name = "{}_backend-{}-{}_{}".format(config["model"],
                                                   idist.backend(),
                                                   idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_imagenet_dataloader(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_supervised_trainer(model, optimizer, criterion,
                                        lr_scheduler, train_loader.sampler,
                                        config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }

    # We define two evaluators as they wont have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="test",
    )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info("Stop training on {} iteration".format(
                trainer.state.iteration))
            trainer.terminate()

    @trainer.on(Events.ITERATION_COMPLETED(every=20))
    def print_acc(engine):
        if rank == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}"\
                    .format(engine.state.epoch, engine.state.iteration, len(train_loader),
                            engine.state.saved_batch_loss
                            ))

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
Beispiel #18
0
def create_trainer(
    train_step,
    output_names,
    model,
    ema_model,
    optimizer,
    lr_scheduler,
    supervised_train_loader,
    test_loader,
    cfg,
    logger,
    cta=None,
    unsup_train_loader=None,
    cta_probe_loader=None,
):

    trainer = Engine(train_step)
    trainer.logger = logger

    output_path = os.getcwd()

    to_save = {
        "model": model,
        "ema_model": ema_model,
        "optimizer": optimizer,
        "trainer": trainer,
        "lr_scheduler": lr_scheduler,
    }
    if cta is not None:
        to_save["cta"] = cta

    common.setup_common_training_handlers(
        trainer,
        train_sampler=supervised_train_loader.sampler,
        to_save=to_save,
        save_every_iters=cfg.solver.checkpoint_every,
        output_path=output_path,
        output_names=output_names,
        lr_scheduler=lr_scheduler,
        with_pbars=False,
        clear_cuda_cache=False,
    )

    ProgressBar(persist=False).attach(
        trainer, metric_names="all", event_name=Events.ITERATION_COMPLETED
    )

    unsupervised_train_loader_iter = None
    if unsup_train_loader is not None:
        unsupervised_train_loader_iter = cycle(unsup_train_loader)

    cta_probe_loader_iter = None
    if cta_probe_loader is not None:
        cta_probe_loader_iter = cycle(cta_probe_loader)

    # Setup handler to prepare data batches
    @trainer.on(Events.ITERATION_STARTED)
    def prepare_batch(e):
        sup_batch = e.state.batch
        e.state.batch = {
            "sup_batch": sup_batch,
        }
        if unsupervised_train_loader_iter is not None:
            unsup_batch = next(unsupervised_train_loader_iter)
            e.state.batch["unsup_batch"] = unsup_batch

        if cta_probe_loader_iter is not None:
            cta_probe_batch = next(cta_probe_loader_iter)
            cta_probe_batch["policy"] = [
                deserialize(p) for p in cta_probe_batch["policy"]
            ]
            e.state.batch["cta_probe_batch"] = cta_probe_batch

    # Setup handler to update EMA model
    @trainer.on(Events.ITERATION_COMPLETED, cfg.ema_decay)
    def update_ema_model(ema_decay):
        # EMA on parametes
        for ema_param, param in zip(ema_model.parameters(), model.parameters()):
            ema_param.data.mul_(ema_decay).add_(param.data, alpha=1.0 - ema_decay)

    # Setup handlers for debugging
    if cfg.debug:

        @trainer.on(Events.STARTED | Events.ITERATION_COMPLETED(every=100))
        @idist.one_rank_only()
        def log_weights_norms():
            wn = []
            ema_wn = []
            for ema_param, param in zip(ema_model.parameters(), model.parameters()):
                wn.append(torch.mean(param.data))
                ema_wn.append(torch.mean(ema_param.data))

            msg = "\n\nWeights norms"
            msg += "\n- Raw model: {}".format(
                to_list_str(torch.tensor(wn[:10] + wn[-10:]))
            )
            msg += "\n- EMA model: {}\n".format(
                to_list_str(torch.tensor(ema_wn[:10] + ema_wn[-10:]))
            )
            logger.info(msg)

            rmn = []
            rvar = []
            ema_rmn = []
            ema_rvar = []
            for m1, m2 in zip(model.modules(), ema_model.modules()):
                if isinstance(m1, nn.BatchNorm2d) and isinstance(m2, nn.BatchNorm2d):
                    rmn.append(torch.mean(m1.running_mean))
                    rvar.append(torch.mean(m1.running_var))
                    ema_rmn.append(torch.mean(m2.running_mean))
                    ema_rvar.append(torch.mean(m2.running_var))

            msg = "\n\nBN buffers"
            msg += "\n- Raw mean: {}".format(to_list_str(torch.tensor(rmn[:10])))
            msg += "\n- Raw var: {}".format(to_list_str(torch.tensor(rvar[:10])))
            msg += "\n- EMA mean: {}".format(to_list_str(torch.tensor(ema_rmn[:10])))
            msg += "\n- EMA var: {}\n".format(to_list_str(torch.tensor(ema_rvar[:10])))
            logger.info(msg)

        # TODO: Need to inspect a bug
        # if idist.get_rank() == 0:
        #     from ignite.contrib.handlers import ProgressBar
        #
        #     profiler = BasicTimeProfiler()
        #     profiler.attach(trainer)
        #
        #     @trainer.on(Events.ITERATION_COMPLETED(every=200))
        #     def log_profiling(_):
        #         results = profiler.get_results()
        #         profiler.print_results(results)

    # Setup validation engine
    metrics = {
        "accuracy": Accuracy(),
    }

    if not (idist.has_xla_support and idist.backend() == idist.xla.XLA_TPU):
        metrics.update({
            "precision": Precision(average=False),
            "recall": Recall(average=False),
        })

    eval_kwargs = dict(
        metrics=metrics,
        prepare_batch=sup_prepare_batch,
        device=idist.device(),
        non_blocking=True,
    )

    evaluator = create_supervised_evaluator(model, **eval_kwargs)
    ema_evaluator = create_supervised_evaluator(ema_model, **eval_kwargs)

    def log_results(epoch, max_epochs, metrics, ema_metrics):
        msg1 = "\n".join(
            ["\t{:16s}: {}".format(k, to_list_str(v)) for k, v in metrics.items()]
        )
        msg2 = "\n".join(
            ["\t{:16s}: {}".format(k, to_list_str(v)) for k, v in ema_metrics.items()]
        )
        logger.info(
            "\nEpoch {}/{}\nRaw:\n{}\nEMA:\n{}\n".format(epoch, max_epochs, msg1, msg2)
        )
        if cta is not None:
            logger.info("\n" + stats(cta))

    @trainer.on(
        Events.EPOCH_COMPLETED(every=cfg.solver.validate_every)
        | Events.STARTED
        | Events.COMPLETED
    )
    def run_evaluation():
        evaluator.run(test_loader)
        ema_evaluator.run(test_loader)
        log_results(
            trainer.state.epoch,
            trainer.state.max_epochs,
            evaluator.state.metrics,
            ema_evaluator.state.metrics,
        )

    # setup TB logging
    if idist.get_rank() == 0:
        tb_logger = common.setup_tb_logging(
            output_path,
            trainer,
            optimizers=optimizer,
            evaluators={"validation": evaluator, "ema validation": ema_evaluator},
            log_every_iters=15,
        )
        if cfg.online_exp_tracking.wandb:
            from ignite.contrib.handlers import WandBLogger

            wb_dir = Path("/tmp/output-fixmatch-wandb")
            if not wb_dir.exists():
                wb_dir.mkdir()

            _ = WandBLogger(
                project="fixmatch-pytorch",
                name=cfg.name,
                config=cfg,
                sync_tensorboard=True,
                dir=wb_dir.as_posix(),
                reinit=True,
            )

    resume_from = cfg.solver.resume_from
    if resume_from is not None:
        resume_from = list(Path(resume_from).rglob("training_checkpoint*.pt*"))
        if len(resume_from) > 0:
            # get latest
            checkpoint_fp = max(resume_from, key=lambda p: p.stat().st_mtime)
            assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(
                checkpoint_fp.as_posix()
            )
            logger.info("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix()))
            checkpoint = torch.load(checkpoint_fp.as_posix())
            Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)

    @trainer.on(Events.COMPLETED)
    def release_all_resources():
        nonlocal unsupervised_train_loader_iter, cta_probe_loader_iter

        if idist.get_rank() == 0:
            tb_logger.close()

        if unsupervised_train_loader_iter is not None:
            unsupervised_train_loader_iter = None

        if cta_probe_loader_iter is not None:
            cta_probe_loader_iter = None

    return trainer
def get_logger(
    config: Any,
    trainer: Engine,
    evaluator: Optional[Union[Engine, Dict[str, Engine]]] = None,
    optimizers: Optional[Union[Optimizer, Dict[str, Optimizer]]] = None,
    **kwargs: Any,
) -> Optional[BaseLogger]:
    """Get Ignite provided logger.

    Parameters
    ----------
    config
        Config object for setting up loggers

    `config` has to contain
    - `filepath`: logging path to output file
    - `logger_log_every_iters`: logging iteration interval for loggers

    trainer
        trainer engine
    evaluator
        evaluator engine
    optimizers
        optimizers to log optimizer parameters
    kwargs
        optional keyword arguments passed to the logger

    Returns
    -------
    logger_handler
        Ignite provided logger instance
    """

    {% if logger_deps == 'clearml' %}
    logger_handler = common.setup_clearml_logging(
        trainer=trainer,
        optimizers=optimizers,
        evaluators=evaluator,
        log_every_iters=config.logger_log_every_iters,
        **kwargs,
    )
    {% elif logger_deps == 'mlflow' %}
    logger_handler = common.setup_mlflow_logging(
        trainer=trainer,
        optimizers=optimizers,
        evaluators=evaluator,
        log_every_iters=config.logger_log_every_iters,
        **kwargs,
    )
    {% elif logger_deps == 'neptune-client' %}
    logger_handler = common.setup_neptune_logging(
        trainer=trainer,
        optimizers=optimizers,
        evaluators=evaluator,
        log_every_iters=config.logger_log_every_iters,
        **kwargs,
    )
    {% elif logger_deps == 'polyaxon-client' %}
    logger_handler = common.setup_plx_logging(
        trainer=trainer,
        optimizers=optimizers,
        evaluators=evaluator,
        log_every_iters=config.logger_log_every_iters,
        **kwargs,
    )
    {% elif logger_deps == 'tensorboard' %}
    logger_handler = common.setup_tb_logging(
        output_path=config.output_dir,
        trainer=trainer,
        optimizers=optimizers,
        evaluators=evaluator,
        log_every_iters=config.logger_log_every_iters,
        **kwargs,
    )
    {% elif logger_deps == 'visdom' %}
    logger_handler = common.setup_visdom_logging(
        trainer=trainer,
        optimizers=optimizers,
        evaluators=evaluator,
        log_every_iters=config.logger_log_every_iters,
        **kwargs,
    )
    {% elif logger_deps == 'wandb' %}
    logger_handler = common.setup_wandb_logging(
        trainer=trainer,
        optimizers=optimizers,
        evaluators=evaluator,
        log_every_iters=config.logger_log_every_iters,
        **kwargs,
    )
    {% else %}
    logger_handler = None
    {% endif %}
    return logger_handler