Example #1
def initialize(
    config: Optional[Any], num_channels: int
) -> Tuple[Module, Module, Optimizer, Optimizer, Module, Optional[Union[_LRScheduler, ParamScheduler]]]:
    """Initialize the discriminator, generator, their optimizers, and the loss
    function with the correct settings.

    Parameters
    ----------
    config
        config object
    num_channels
        number of channels for the Generator

    Returns
    -------
    netD, netG, optimizerD, optimizerG, loss_fn, lr_scheduler (``None`` here)
    """
    netG = idist.auto_model(Generator(config.z_dim, config.g_filters, num_channels))
    netD = idist.auto_model(Discriminator(num_channels, config.d_filters))
    loss_fn = nn.BCELoss()
    optimizerG = optim.Adam(netG.parameters(), lr=config.lr, betas=(config.beta_1, 0.999))
    optimizerD = optim.Adam(netD.parameters(), lr=config.lr, betas=(config.beta_1, 0.999))
    loss_fn = loss_fn.to(idist.device())

    return netD, netG, optimizerD, optimizerG, loss_fn, None
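
A minimal sketch of how an initialize function like this is usually driven with ignite.distributed, assuming the Generator/Discriminator classes and a config object come from the surrounding project; the backend, process count, and num_channels value below are placeholders:

import ignite.distributed as idist

def training(local_rank, config):
    netD, netG, optimizerD, optimizerG, loss_fn, lr_scheduler = initialize(config, num_channels=3)
    # ... build the GAN trainer from these objects and run it ...

if __name__ == "__main__":
    # config is assumed to be constructed elsewhere (e.g. parsed from a YAML file)
    with idist.Parallel(backend="nccl", nproc_per_node=2) as parallel:
        parallel.run(training, config)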
Example #2
    def _init_distribution(self):
        self.rank = idist.get_rank()
        manual_seed(42 + self.rank)
        self.device = idist.device()

        if self.train_ds:
            if self.train_ds.sampler is not None:
                sampler = self.train_ds.sampler(self.train_ds,
                                                self.train_ds.get_label)
                isShuffle = False
            else:
                sampler = None
                isShuffle = True
            self.train_loader = idist.auto_dataloader(
                self.train_ds,
                batch_size=self.hparams.train_bs,
                num_workers=self.hparams.train_num_workers,
                shuffle=isShuffle,
                drop_last=True,
                sampler=sampler,
                **self.train_ds.additional_loader_params)

        if self.valid_ds:
            self.valid_loader = idist.auto_dataloader(
                self.valid_ds,
                batch_size=self.hparams.valid_bs,
                num_workers=self.hparams.valid_num_workers,
                shuffle=False,
                drop_last=False,
                **self.valid_ds.additional_loader_params)

        if self.test_ds:
            self.test_loader = idist.auto_dataloader(
                self.test_ds,
                batch_size=self.hparams.valid_bs,
                num_workers=self.hparams.valid_num_workers,
                shuffle=False,
                drop_last=False,
                **self.test_ds.additional_loader_params)

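        # Apex AMP needs an already-constructed optimizer, so in this branch the
        # optimizer is created before amp.initialize() wraps both model and optimizer.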
        if USE_AMP:
            self._init_optimizer()
            self.model = idist.auto_model(self.model)
            self.model, self.optimizer = amp.initialize(self.model,
                                                        self.optimizer,
                                                        opt_level="O1")
        else:
            self.model = idist.auto_model(self.model)
            self._init_optimizer()

        self.optimizer = idist.auto_optim(self.optimizer)

        self._init_scheduler()

        self.criterion = self.criterion.to(self.device)
Example #3
def initialize(config):
    model = utils.get_model(config["model"])
    # Adapt model for distributed backend if provided
    model = idist.auto_model(model)

    optimizer = utils.get_optimizer(
        config["optimizer"],
        model,
        learning_rate=config["learning_rate"],
        weight_decay=config["weight_decay"],
    )
    # Adapt optimizer for distributed backend if provided
    optimizer = idist.auto_optim(optimizer)
    criterion = nn.CrossEntropyLoss().to(idist.device())

    le = config["num_iters_per_epoch"]
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    return model, optimizer, criterion, lr_scheduler
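
Usage sketch, assuming a trainer Engine and a config dict already exist: PiecewiseLinear is an Ignite ParamScheduler, so it is attached to the trainer as an event handler instead of being stepped manually.

from ignite.engine import Events

model, optimizer, criterion, lr_scheduler = initialize(config)
# Update the learning rate at the start of every training iteration
trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)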
Example #4
def initialize(config):

    device = idist.device()

    model = config.model.to(device)
    optimizer = config.optimizer

    # Adapt model to dist config
    model = idist.auto_model(model)

    if idist.backend() == "horovod":
        accumulation_steps = config.get("accumulation_steps", 1)
        # Can not use auto_optim with Horovod: https://github.com/horovod/horovod/issues/2670
        import horovod.torch as hvd

        optimizer = hvd.DistributedOptimizer(
            optimizer,
            named_parameters=model.named_parameters(),
            backward_passes_per_step=accumulation_steps,
        )
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        if accumulation_steps > 1:
            # disable manual grads accumulation as it is already done on optimizer's side
            config.accumulation_steps = 1
    else:
        optimizer = idist.auto_optim(optimizer)
    criterion = config.criterion.to(device)

    return model, optimizer, criterion
Example #5
def get_model(config):
    """For a list of possible architectures, encoders and weights, please refer to:
    https://github.com/qubvel/segmentation_models.pytorch#architectures-
    """
    assert config["architecture"] in smp.__dict__, \
        f"Unknown architecture: {config['architecture']}"

    model = smp.__dict__[config["architecture"]](
        encoder_name=config["encoder"],
        encoder_weights=config["encoder_weights"],
        classes=1,
        activation="sigmoid",
    )

    if "encoder_freeze_at" in config:
        freeze_encoder_at(model.encoder, config["encoder_freeze_at"])

    if "device" in config:
        if config["device"] == "cpu":
            return model

    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    return model
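
A hypothetical config for get_model above, assuming segmentation_models_pytorch (smp) is installed; the architecture/encoder/weights values are standard smp options chosen purely for illustration:

config = {
    "architecture": "Unet",        # must be a model class exposed in smp.__dict__
    "encoder": "resnet34",
    "encoder_weights": "imagenet",
}
model = get_model(config)  # wrapped by idist.auto_model unless config["device"] == "cpu"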
Example #6
def initialize(config):
    model = utils.get_model(config["model"], config["model_dir"],
                            config["dropout"], config["n_fc"],
                            config["num_classes"])

    config["learning_rate"] *= idist.get_world_size()
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    optimizer = optim.AdamW(
        model.parameters(),
        lr=config["learning_rate"],
        weight_decay=config["weight_decay"],
    )
    optimizer = idist.auto_optim(optimizer)
    criterion = nn.BCEWithLogitsLoss()

    le = config["num_iters_per_epoch"]
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    return model, optimizer, criterion, lr_scheduler
Example #7
def initialize(
    config: ConfigSchema, wlm: WeakLabelManager
) -> Tuple[nn.Module, Optimizer, nn.Module]:
    model = get_model(config.model)
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

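    # Split parameters so that biases and 1-D tensors (e.g. norm scales) get no weight decay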
    to_decay, not_to_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        elif len(param.shape) == 1 or name.endswith("bias"):
            not_to_decay.append(param)
        else:
            to_decay.append(param)
    optimizer = optim.SGD(
        [
            {"params": to_decay, "weight_decay": config.weight_decay},
            {"params": not_to_deacy, "weight_decay": 0.0},
        ],
        lr=config.learning_rate,
        momentum=config.momentum,
        nesterov=True,
    )
    optimizer = idist.auto_optim(optimizer)
    criterion = get_weak_label_loss(config, wlm).to(idist.device())

    return model, optimizer, criterion
Example #8
def _test_ema_final_weight(model, device, ddp=False, interval=1):
    """Test if final smoothed weights are correct"""
    if isinstance(device, str):
        device = torch.device(device)
    model = model.to(device)
    if ddp:
        model = idist.auto_model(model)
    step_fn = _get_dummy_step_fn(model)
    engine = Engine(step_fn)

    # momentum will be constantly 0.5
    ema_handler = EMAHandler(model, momentum_warmup=0.5, momentum=0.5, warmup_iters=1)
    ema_handler.attach(engine, "model", event=Events.ITERATION_COMPLETED(every=interval))

    # engine will run 4 iterations
    engine.run(range(2), max_epochs=2)

    ema_weight = ema_handler.ema_model.weight.data
    model_weight = model.weight.data
    assert ema_weight.device == device
    assert model_weight.device == device
    if interval == 1:
        torch.testing.assert_allclose(ema_weight, torch.full((1, 2), 4.0625, device=device))
    elif interval == 2:
        torch.testing.assert_allclose(ema_weight, torch.full((1, 2), 3.5, device=device))
    else:
        pass
    torch.testing.assert_allclose(model_weight, torch.full((1, 2), 5.0, device=device))
Example #9
def _test_ema_final_weight(model, device=None, ddp=False, interval=1):
    """Test if final smoothed weights are correct"""
    if device is None:
        # let horovod decide the device
        device = idist.device()
    if isinstance(device, str):
        device = torch.device(device)
    model = model.to(device)
    if ddp:
        model = idist.auto_model(model)
    step_fn = _get_dummy_step_fn(model)
    engine = Engine(step_fn)

    ema_handler = EMAHandler(model, momentum=0.5)
    ema_handler.attach(engine,
                       "model",
                       event=Events.ITERATION_COMPLETED(every=interval))

    # engine will run 4 iterations
    engine.run(range(2), max_epochs=2)

    # ema_model and model can be DP or DDP
    # explicitly cast to float32 to avoid test failure on XLA devices
    ema_weight = _unwrap_model(ema_handler.ema_model).weight.data.to(
        torch.float32)
    model_weight = _unwrap_model(model).weight.data.to(torch.float32)
    assert ema_weight.device == device
    assert model_weight.device == device
    if interval == 1:
        assert ema_weight.allclose(ema_weight.new_full((1, 2), 4.0625))
    elif interval == 2:
        assert ema_weight.allclose(ema_weight.new_full((1, 2), 3.5))
    else:
        pass
    assert model_weight.allclose(model_weight.new_full((1, 2), 5.0))
Example #10
def initialize(config):
    model = utils.get_model(config["model"])
    # Adapt model for distributed settings if configured
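    # Extra kwargs are forwarded to DistributedDataParallel; find_unused_parameters=True
    # is needed when some parameters do not receive gradients on every forward pass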
    model = idist.auto_model(model, find_unused_parameters=True)

    optimizer = optim.SGD(
        model.parameters(),
        lr=config["learning_rate"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        nesterov=True,
    )
    optimizer = idist.auto_optim(optimizer)
    criterion = nn.CrossEntropyLoss().to(idist.device())

    le = config["num_iters_per_epoch"]
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    return model, optimizer, criterion, lr_scheduler
Example #11
def evaluation(local_rank, config, logger, with_clearml):

    rank = idist.get_rank()
    device = idist.device()
    manual_seed(config.seed + local_rank)

    data_loader = config.data_loader
    model = config.model.to(device)

    # Load weights:
    state_dict = get_model_weights(config, logger, with_clearml)
    model.load_state_dict(state_dict)

    # Adapt model to dist config
    model = idist.auto_model(model)

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if ("val_metrics" in config) and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator = create_evaluator(model,
                                 val_metrics,
                                 config,
                                 with_clearml,
                                 tag="val")

    # Setup Tensorboard logger
    if rank == 0:
        tb_logger = common.TensorboardLogger(
            log_dir=config.output_path.as_posix())
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.COMPLETED,
            tag="validation",
            metric_names="all",
        )

    # Log confusion matrix to ClearML:
    if with_clearml:
        evaluator.add_event_handler(Events.COMPLETED, compute_and_log_cm,
                                    cm_metric, evaluator.state.iteration)

    state = evaluator.run(data_loader)
    utils.log_metrics(logger, 0, state.times["COMPLETED"], "Validation",
                      state.metrics)

    if idist.get_rank() == 0:
        tb_logger.close()
Example #12
def test_ema_ema_model_on_cuda(get_dummy_model):
    """Test if ema_handler.ema_model is nn.Module and under eval mode"""
    model = get_dummy_model().to(idist.device())
    model = idist.auto_model(model)
    ema_handler = EMAHandler(model)
    ema_model = ema_handler.ema_model
    assert (
        isinstance(ema_model, nn.Module)
        and not isinstance(ema_model, nn.parallel.DistributedDataParallel)
        and not isinstance(ema_model, nn.parallel.DataParallel)
    )
    assert not ema_model.training
Example #13
def initialize(config):

    model = config.model.to(config.device)
    optimizer = config.optimizer
    # Setup Nvidia/Apex AMP
    model, optimizer = amp.initialize(model, optimizer, opt_level=getattr(config, "fp16_opt_level", "O2"), num_losses=1)

    # Adapt model to dist conf
    model = idist.auto_model(model)

    criterion = config.criterion.to(config.device)

    return model, optimizer, criterion
Example #14
def initialize(config):
    model = get_model(config["model"])
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("learning_rate", 0.1),
        momentum=config.get("momentum", 0.9),
        weight_decay=config.get("weight_decay", 1e-5),
        nesterov=True,
    )
    optimizer = idist.auto_optim(optimizer)
    criterion = nn.CrossEntropyLoss().to(idist.device())

    le = config["num_iters_per_epoch"]
    lr_scheduler = StepLR(optimizer, step_size=le, gamma=0.9)

    return model, optimizer, criterion, lr_scheduler
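
StepLR is a plain torch scheduler rather than an Ignite ParamScheduler, so it is typically wrapped before being attached to a trainer. A sketch assuming a trainer Engine exists (on older Ignite versions the wrapper lives under ignite.contrib.handlers):

from ignite.engine import Events
from ignite.handlers.param_scheduler import LRScheduler

model, optimizer, criterion, lr_scheduler = initialize(config)
# Drive the torch scheduler from trainer events
trainer.add_event_handler(Events.ITERATION_COMPLETED, LRScheduler(lr_scheduler))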
Example #15
def initialize(config):
    model = get_model(config.model, config.model_dir, config.dropout, config.n_fc, config.num_classes)

    config.learning_rate *= idist.get_world_size()
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    optimizer = optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
    optimizer = idist.auto_optim(optimizer)
    loss_fn = nn.BCEWithLogitsLoss()

    le = config.num_iters_per_epoch
    milestones_values = [
        (0, 0.0),
        (le * config.num_warmup_epochs, config.learning_rate),
        (le * config.max_epochs, 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)

    return model, optimizer, loss_fn, lr_scheduler
Example #16
def initialize(config: Optional[Any]) -> Tuple[Module, Optimizer, Module, Union[_LRScheduler, ParamScheduler]]:
    """Initializing model, optimizer, loss function, and lr scheduler
    with correct settings.

    Parameters
    ----------
    config:
        config object

    Returns
    -------
    model, optimizer, loss_fn, lr_scheduler
    """
    model = ...
    optimizer = ...
    loss_fn = ...
    lr_scheduler = ...
    model = idist.auto_model(model)
    optimizer = idist.auto_optim(optimizer)
    loss_fn = loss_fn.to(idist.device())

    return model, optimizer, loss_fn, lr_scheduler
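
A concrete (hypothetical) instantiation of this template, using torchvision's resnet18, SGD, cross-entropy loss, and StepLR as stand-ins for the elided pieces; config.num_classes, config.learning_rate, and config.lr_step_size are assumed attributes:

import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet18
import ignite.distributed as idist

def initialize(config):
    model = resnet18(num_classes=config.num_classes)
    optimizer = optim.SGD(model.parameters(), lr=config.learning_rate, momentum=0.9)
    loss_fn = nn.CrossEntropyLoss()

    # Adapt everything to the current distributed configuration
    model = idist.auto_model(model)
    optimizer = idist.auto_optim(optimizer)
    loss_fn = loss_fn.to(idist.device())
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=config.lr_step_size, gamma=0.1)

    return model, optimizer, loss_fn, lr_scheduler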
Example #17
def initialize(config):
    model = utils.get_model(config["model"], config["num_classes"])
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    optimizer = optim.SGD(
        model.parameters(),
        lr=config["learning_rate"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        #        nesterov=True,
    )
    optimizer = idist.auto_optim(optimizer)

    # criterion = nn.CrossEntropyLoss().to(idist.device())
    criterion = nn.CrossEntropyLoss()

    le = config["num_iters_per_epoch"]
    cl = config["learning_rate"]
    # print("%d, %f" %(le,cl))
    milestones_values = [
        (30 * le, cl),
        (45 * le, 0.5 * cl),
        (46 * le, 0.1 * cl),
        (60 * le, 0.1 * cl),
        (61 * le, 0.01 * cl),
        (90 * le, 0.01 * cl),
        (120 * le, 0.001 * cl),
        # (le * config["num_warmup_epochs"], config["learning_rate"]),
        # (le * config["num_epochs"], 0.0),
    ]
    # print(milestones_values)
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config["lr_step_size"], gamma=config["lr_gamma"])

    return model, optimizer, criterion, lr_scheduler
Example #18
def initialize(cfg):
    model = setup_model(cfg.model, num_classes=cfg.num_classes)
    ema_model = setup_model(cfg.model, num_classes=cfg.num_classes)

    model.to(idist.device())
    ema_model.to(idist.device())
    setup_ema(ema_model, model)

    model = idist.auto_model(model)

    if isinstance(model, nn.parallel.DataParallel):
        ema_model = nn.parallel.DataParallel(ema_model)

    optimizer = instantiate(cfg.solver.optimizer, model.parameters())
    optimizer = idist.auto_optim(optimizer)

    sup_criterion = instantiate(cfg.solver.supervised_criterion)

    total_num_iters = cfg.solver.num_epochs * cfg.solver.epoch_length
    lr_scheduler = instantiate(cfg.solver.lr_scheduler,
                               optimizer,
                               T_max=total_num_iters)

    return model, ema_model, optimizer, sup_criterion, lr_scheduler
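
setup_ema is project-specific and not shown here; a plausible minimal sketch of what such a helper usually does (copy the current weights into the EMA model and freeze it) could look like this:

def setup_ema(ema_model, model):
    # Start the EMA model from the current model weights
    ema_model.load_state_dict(model.state_dict())
    # The EMA model is updated by averaging, never by backpropagation
    for param in ema_model.parameters():
        param.requires_grad_(False)
    ema_model.eval()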
Example #19
def training(rank, config):
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    # Define output folder:
    config.output = "/tmp/output"

    model = idist.auto_model(config.model)
    optimizer = idist.auto_optim(config.optimizer)
    criterion = config.criterion

    train_set, val_set = config.train_set, config.val_set
    train_loader = idist.auto_dataloader(train_set,
                                         batch_size=config.train_batch_size)
    val_loader = idist.auto_dataloader(val_set,
                                       batch_size=config.val_batch_size)

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED(every=config.val_interval))
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    if rank == 0:
        tb_logger = TensorboardLogger(log_dir=config.output)

        tb_logger.attach_output_handler(
            trainer,
            event_name=Events.ITERATION_COMPLETED(every=100),
            tag="training",
            output_transform=lambda loss: {"batchloss": loss},
            metric_names="all",
        )

        for tag, evaluator in [("training", train_evaluator),
                               ("validation", validation_evaluator)]:
            tb_logger.attach_output_handler(
                evaluator,
                event_name=Events.EPOCH_COMPLETED,
                tag=tag,
                metric_names=["loss", "accuracy"],
                global_step_transform=global_step_from_engine(trainer),
            )

        tb_logger.attach_opt_params_handler(
            trainer,
            event_name=Events.ITERATION_COMPLETED(every=100),
            optimizer=optimizer)

    model_checkpoint = ModelCheckpoint(
        config.output,
        n_saved=2,
        filename_prefix="best",
        score_name="accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint,
                                           {"model": model})

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if rank == 0:
        tb_logger.close()
Example #20
def train(loop: Loop, config: Config):
    seed_everything(22)
    setup_cudnn_reproducibility(True)

    dataloader, num_channels = get_dataloader(config)

    generator = auto_model(
        Generator(config.z_dim, config.g_filters, num_channels))
    discriminator = auto_model(Discriminator(num_channels, config.d_filters))

    bce = BCEWithLogitsLoss()

    opt_G = Adam(generator.parameters(),
                 lr=config.lr * idist.get_world_size(),
                 betas=(config.beta_1, 0.999))
    opt_D = Adam(discriminator.parameters(),
                 lr=config.lr * idist.get_world_size(),
                 betas=(config.beta_1, 0.999))

    device = idist.device()
    real_labels = torch.ones(config.batch_size, device=device)
    fake_labels = torch.zeros(config.batch_size, device=device)
    fixed_noise = torch.randn(16, config.z_dim, 1, 1, device=device)

    def dump_fake_images_to_tb():
        with loop.mode("valid"):
            fake = make_grid(generator(fixed_noise),
                             normalize=True,
                             range=(-1, 1)).cpu()

        if idist.get_rank() == 0:
            sw: SummaryWriter = get_summary_writer(loop)
            sw.add_image("fake_images",
                         fake,
                         global_step=loop.iterations.current_epoch)

    def get_noise():
        return torch.randn(config.batch_size,
                           config.z_dim,
                           1,
                           1,
                           device=device)

    error_D_avg = Average()
    error_G_avg = Average()

    loop.attach(generator=generator,
                discriminator=discriminator,
                d_opt=opt_D,
                g_opt=opt_G)

    def stage_1(loop: Loop):
        for _ in loop.iterate_epochs(config.epochs):

            for real, _ in loop.iterate_dataloader(dataloader, mode="train"):
                output = discriminator(real)
                error_D_real = bce(output, real_labels)
                loop.backward(error_D_real)

                fake = generator(get_noise())

                # train with fake
                output = discriminator(fake.detach())
                error_D_fake = bce(output, fake_labels)

                loop.backward(error_D_fake)
                loop.optimizer_step(opt_D)

                with torch.no_grad():
                    error_D = error_D_fake + error_D_real
                    error_D_avg.update(error_D)

                with no_grad_for_module(discriminator), module_eval(
                        discriminator):
                    # We don't want to compute grads for
                    # discriminator parameters on
                    # error_G backward pass
                    output = discriminator(fake)

                error_G = bce(output, real_labels)

                simple_gd_step(loop, opt_G, error_G)

                error_G_avg.update(error_G.detach())

                loop.metrics.log("generator/error_batch", error_G.item())
                loop.metrics.log("discriminator/error_batch", error_D.item())

            loop.metrics.consume("generator/error_epoch", error_G_avg)
            loop.metrics.consume("discriminator/error_epoch", error_D_avg)

            dump_fake_images_to_tb()

    loop.run(stage_1)
Example #21
def run(
    local_rank: int,
    device: str,
    experiment_name: str,
    gpus: Optional[Union[int, List[int], str]] = None,
    dataset_root: str = "./dataset",
    log_dir: str = "./log",
    model: str = "fasterrcnn_resnet50_fpn",
    epochs: int = 13,
    batch_size: int = 4,
    lr: float = 0.01,
    download: bool = False,
    image_size: int = 256,
    resume_from: Optional[dict] = None,
) -> None:
    bbox_params = A.BboxParams(format="pascal_voc")
    train_transform = A.Compose(
        [A.HorizontalFlip(p=0.5), ToTensorV2()],
        bbox_params=bbox_params,
    )
    val_transform = A.Compose([ToTensorV2()], bbox_params=bbox_params)

    download = local_rank == 0 and download
    train_dataset = Dataset(root=dataset_root,
                            download=download,
                            image_set="train",
                            transforms=train_transform)
    val_dataset = Dataset(root=dataset_root,
                          download=download,
                          image_set="val",
                          transforms=val_transform)
    vis_dataset = Subset(val_dataset,
                         random.sample(range(len(val_dataset)), k=16))

    train_dataloader = idist.auto_dataloader(train_dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             collate_fn=collate_fn,
                                             num_workers=4)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                collate_fn=collate_fn,
                                num_workers=4)
    vis_dataloader = DataLoader(vis_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                collate_fn=collate_fn,
                                num_workers=4)

    model = idist.auto_model(model)
    scaler = GradScaler()
    optimizer = SGD(lr=lr, params=model.parameters())
    optimizer = idist.auto_optim(optimizer)
    scheduler = OneCycleLR(optimizer,
                           max_lr=lr,
                           total_steps=len(train_dataloader) * epochs)

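    # Mixed-precision training step: forward pass under torch.autocast, backward on the
    # GradScaler-scaled loss, then optimizer step and scaler update.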
    def update_model(engine, batch):
        model.train()
        images, targets = batch
        images = list(image.to(device) for image in images)
        targets = [{
            k: v.to(device)
            for k, v in t.items() if isinstance(v, torch.Tensor)
        } for t in targets]

        with torch.autocast(device, enabled=True):
            loss_dict = model(images, targets)
            loss = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        loss_items = {k: v.item() for k, v in loss_dict.items()}
        loss_items["loss_average"] = loss.item() / 4

        return loss_items

    @torch.no_grad()
    def inference(engine, batch):
        model.eval()
        images, targets = batch
        images = list(image.to(device) for image in images)
        outputs = model(images)
        outputs = [{k: v.to("cpu") for k, v in t.items()} for t in outputs]
        return {
            "y_pred": outputs,
            "y": targets,
            "x": [i.cpu() for i in images]
        }

    trainer = Engine(update_model)
    evaluator = Engine(inference)
    visualizer = Engine(inference)

    aim_logger = AimLogger(
        repo=os.path.join(log_dir, "aim"),
        experiment=experiment_name,
    )

    CocoMetric(convert_to_coco_api(val_dataset)).attach(evaluator, "mAP")

    @trainer.on(Events.EPOCH_COMPLETED)
    @one_rank_only()
    def log_validation_results(engine):
        evaluator.run(val_dataloader)
        visualizer.run(vis_dataloader)

    @trainer.on(Events.ITERATION_COMPLETED)
    def step_scheduler(engine):
        scheduler.step()
        aim_logger.log_metrics({"lr": scheduler.get_last_lr()[0]},
                               step=engine.state.iteration)

    @visualizer.on(Events.EPOCH_STARTED)
    def reset_vis_images(engine):
        engine.state.model_outputs = []

    @visualizer.on(Events.ITERATION_COMPLETED)
    def add_vis_images(engine):
        engine.state.model_outputs.append(engine.state.output)

    @visualizer.on(Events.ITERATION_COMPLETED)
    def submit_vis_images(engine):
        aim_images = []
        for outputs in engine.state.model_outputs:
            for image, target, pred in zip(outputs["x"], outputs["y"],
                                           outputs["y_pred"]):
                image = (image * 255).byte()
                pred_labels = [
                    Dataset.class2name[label.item()]
                    for label in pred["labels"]
                ]
                pred_boxes = pred["boxes"].long()
                image = draw_bounding_boxes(image,
                                            pred_boxes,
                                            pred_labels,
                                            colors="red")

                target_labels = [
                    Dataset.class2name[label.item()]
                    for label in target["labels"]
                ]
                target_boxes = target["boxes"].long()
                image = draw_bounding_boxes(image,
                                            target_boxes,
                                            target_labels,
                                            colors="green")

                aim_images.append(aim.Image(image.numpy().transpose(
                    (1, 2, 0))))
        aim_logger.experiment.track(aim_images,
                                    name="vis",
                                    step=trainer.state.epoch)

    losses = [
        "loss_classifier", "loss_box_reg", "loss_objectness",
        "loss_rpn_box_reg", "loss_average"
    ]
    for loss_name in losses:
        RunningAverage(output_transform=lambda x: x[loss_name]).attach(
            trainer, loss_name)
    ProgressBar().attach(trainer, losses)
    ProgressBar().attach(evaluator)

    objects_to_checkpoint = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": scheduler,
        "scaler": scaler,
    }
    checkpoint = Checkpoint(
        to_save=objects_to_checkpoint,
        save_handler=DiskSaver(log_dir, require_empty=False),
        n_saved=3,
        score_name="mAP",
        global_step_transform=lambda *_: trainer.state.epoch,
    )
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint)
    if resume_from:
        Checkpoint.load_objects(objects_to_checkpoint, torch.load(resume_from))

    aim_logger.log_params({
        "lr": lr,
        "image_size": image_size,
        "batch_size": batch_size,
        "epochs": epochs,
    })
    aim_logger.attach_output_handler(trainer,
                                     event_name=Events.ITERATION_COMPLETED,
                                     tag="train",
                                     output_transform=lambda loss: loss)
    aim_logger.attach_output_handler(
        evaluator,
        event_name=Events.EPOCH_COMPLETED,
        tag="val",
        metric_names=["mAP"],
        global_step_transform=global_step_from_engine(
            trainer, Events.ITERATION_COMPLETED),
    )

    trainer.run(train_dataloader, max_epochs=epochs)
Example #22
def training(rank, config):

    # Specific ignite.distributed
    print(
        idist.get_rank(),
        ": run with config:",
        config,
        "- backend=",
        idist.backend(),
        "- world size",
        idist.get_world_size(),
    )

    device = idist.device()

    # Data preparation:
    dataset = RndDataset(nb_samples=config["nb_samples"])

    # Specific ignite.distributed
    train_loader = idist.auto_dataloader(dataset,
                                         batch_size=config["batch_size"])

    # Model, criterion, optimizer setup
    model = idist.auto_model(wide_resnet50_2(num_classes=100))
    criterion = NLLLoss()
    optimizer = idist.auto_optim(SGD(model.parameters(), lr=0.01))

    # Training loop log param
    log_interval = config["log_interval"]

    def _train_step(engine, batch):

        data = batch[0].to(device)
        target = batch[1].to(device)

        optimizer.zero_grad()
        output = model(data)
        # NLLLoss expects log-probabilities over the class dimension
        log_probs = torch.nn.functional.log_softmax(output, dim=1)

        loss_val = criterion(log_probs, target)
        loss_val.backward()
        optimizer.step()

        return loss_val

    # Running the _train_step function on whole batch_data iterable only once
    trainer = Engine(_train_step)

    # Add a logger
    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training():
        print("Process {}/{} Train Epoch: {} [{}/{}]\tLoss: {}".format(
            idist.get_rank(),
            idist.get_world_size(),
            trainer.state.epoch,
            trainer.state.iteration * len(trainer.state.batch[0]),
            len(dataset) / idist.get_world_size(),
            trainer.state.output,
        ))

    trainer.run(train_loader, max_epochs=1)
Example #23
def run(loop: Loop):
    seed_everything(42)
    setup_cudnn_reproducibility(True, False)

    train_ds, valid_ds = get_train_test_datasets("data/cifar")

    model = auto_model(get_model())

    train_loader = auto_dataloader(
        train_ds,
        batch_size=512,
        shuffle=True,
        drop_last=True,
        num_workers=4,
    )

    valid_loader = auto_dataloader(
        valid_ds,
        batch_size=512,
        num_workers=4,
        shuffle=False,
    )

    optim = SGD(model.parameters(), lr=0.4, momentum=0.9)

    scheduler = OneCycleLR(optim,
                           max_lr=1,
                           epochs=NUM_EPOCHS,
                           steps_per_epoch=len(train_loader))
    criterion = CrossEntropyLoss()

    precision = Precision(average=False)
    recall = Recall(average=False)

    # Ignite metrics are combinable
    f1 = (precision * recall * 2 / (precision + recall)).mean()
    accuracy = Accuracy()

    # We are attaching metrics to automatically reset
    loop.attach(
        # Loop manages train/eval modes, device and requires_grad of attached `nn.Module`s
        criterion=criterion,
        # This criterion doesn't have any state or attribute tensors,
        # so its attachment doesn't introduce any behavior
        model=model,
        # Loop saves state of all attached objects having state_dict()/load_state_dict() methods
        # to checkpoints
        optimizer=optim,
        scheduler=scheduler,
    )

    def train(loop: Loop):
        for _ in loop.iterate_epochs(NUM_EPOCHS):
            for x, y in loop.iterate_dataloader(train_loader, mode="train"):
                y_pred_logits = model(x)

                loss: torch.Tensor = criterion(y_pred_logits, y)
                loop.backward(loss)
                # Makes optimizer step and also
                # zeroes grad after (default)
                loop.optimizer_step(optim, zero_grad=True)

                # Here we call scheduler.step() every iteration
                # because we have a one-cycle scheduler;
                # we could also call it once after the dataloader loop
                # if it were a regular per-epoch scheduler
                scheduler.step()

                # Log learning rate. All metrics are written to tensorboard
                # with specified names
                # If iteration='auto' (default), it's determined based on where the
                # call is performed. Here it will be batches
                loop.metrics.log("lr",
                                 scheduler.get_last_lr()[0],
                                 iteration="auto")

            # Loop disables gradients and calls Module.eval() inside loop
            # for all attached modules when mode="valid" (default)
            for x, y in loop.iterate_dataloader(valid_loader, mode="valid"):
                y_pred_logits: torch.Tensor = model(x)

                y_pred = to_onehot(y_pred_logits.argmax(dim=-1),
                                   num_classes=10)

                precision.update((y_pred, y))
                recall.update((y_pred, y))
                accuracy.update((y_pred, y))

            # These metrics will be epoch metrics because they are logged outside
            # the dataloader loop.
            # Here we log the metrics without resetting them
            loop.metrics.log("valid/precision", precision.compute().mean())
            loop.metrics.log("valid/recall", recall.compute().mean())

            # The .log() method above accepts values (tensors, floats, np.arrays).
            # .consume() accepts a Metric object and resets it after logging
            loop.metrics.consume("valid/f1", f1)
            loop.metrics.consume("valid/accuracy", accuracy)

    loop.run(train)