Example #1
def make_progressive_loader(stage, model, conf):
    adapt = progressive_adaptive_regularization(
        stage,
        conf.training.epoch // conf.training.progressive.step,
        conf.training.progressive.train_sizes,
        conf.training.progressive.valid_sizes,
        conf.training.progressive.randaug_layers,
        conf.training.progressive.randaug_magnitudes,
        conf.training.progressive.mixups,
        conf.training.progressive.cutmixes,
        conf.training.progressive.dropouts,
        conf.training.progressive.drop_paths,
        conf.training.progressive.verbose,
    )
    train_set, valid_set = make_dataset(
        conf.dataset_path,
        adapt.train_size,
        adapt.valid_size,
        {
            "n_augment": adapt.randaug_layer,
            "magnitude": adapt.randaug_magnitude,
            "increasing": conf.training.randaug_increasing,
            "magnitude_std": conf.training.randaug_magnitude_std,
        },
        {
            "mixup": adapt.mixup,
            "cutmix": adapt.cutmix,
            "mix_before_aug": conf.training.mix_before_aug,
        },
    )

    try:
        model.set_dropout(adapt.dropout, adapt.drop_path)

    except AttributeError:
        # not every architecture exposes set_dropout; skip adaptive dropout/drop-path then
        pass

    if conf.training.progressive.grad_accumulation is not None:
        grad_accum = conf.training.progressive.grad_accumulation[stage]

    else:
        grad_accum = conf.training.grad_accumulation

    batch_size = conf.training.dataloader.batch_size // grad_accum

    get_logger(mode=conf.logger).info(f"Using gradient accumulation {grad_accum}")

    train_loader, valid_loader, train_sampler = make_dataloader(
        train_set,
        valid_set,
        batch_size,
        conf.distributed,
        conf.training.dataloader.num_workers,
    )

    return train_loader, valid_loader, train_sampler, grad_accum
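# A minimal sketch of the make_dataloader helper used above (its implementation
# is not part of this section). Assumption: it builds standard PyTorch loaders
# and, when distributed is set, returns the DistributedSampler handle so that
# set_epoch() can be called at the start of every epoch.
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler


def make_dataloader_sketch(train_set, valid_set, batch_size, distributed, num_workers):
    train_sampler = DistributedSampler(train_set, shuffle=True) if distributed else None

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=train_sampler is None,
        sampler=train_sampler,
        num_workers=num_workers,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        valid_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
    )

    return train_loader, valid_loader, train_sampler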
Example #2
def valid(conf, loader, model, criterion):
    device = "cuda"

    batch_time = Meter()
    losses = Meter()
    top1 = Meter()
    top5 = Meter()

    model.eval()

    logger = get_logger(mode=conf.logger)

    start = perf_counter()
    for i, (input, label) in enumerate(loader):
        input = input.to(device)
        label = label.to(device)

        out = model(input)
        loss = criterion(out, label)
        prec1, prec5 = accuracy(out, label, topk=(1, 5))
        batch = input.shape[0]

        loss_dict = {
            "prec1": prec1 * batch,
            "prec5": prec5 * batch,
            "loss": loss * batch,
            "batch": torch.tensor(batch, dtype=torch.float32).to(device),
        }
        loss_reduced = dist.reduce_dict(loss_dict, average=False)
        batch = loss_reduced["batch"].to(torch.int64).item()
        losses.update(loss_reduced["loss"].item() / batch, batch)
        top1.update(loss_reduced["prec1"].item() / batch, batch)
        top5.update(loss_reduced["prec5"].item() / batch, batch)

        batch_time.update(perf_counter() - start)
        start = perf_counter()

        if dist.is_primary() and i % conf.log_freq == 0:
            logger.info(
                f"valid: {i}/{len(loader)}; time: {batch_time.val:.3f} ({batch_time.avg:.3f}); "
                f"loss: {losses.val:.4f} ({losses.avg:.4f}); "
                f"prec@1: {top1.val:.3f} ({top1.avg:.3f}); "
                f"prec@5: {top5.val:.3f} ({top5.avg:.3f})")

    if dist.is_primary():
        logger.info(
            f"validation finished: prec@1 {top1.avg:.3f}, prec@5 {top5.avg:.3f}"
        )

    return top1.avg, top5.avg, losses
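# Minimal sketches of the Meter and accuracy helpers relied on by valid() and
# train(); their real implementations are not shown here. Assumptions: Meter
# keeps the latest value plus a weighted running average, and accuracy returns
# top-k precision in percent.
import torch


class MeterSketch:
    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / max(self.count, 1)


def accuracy_sketch(output, target, topk=(1,)):
    maxk = max(topk)
    batch = target.shape[0]

    # indices of the top-k predictions, shape (k, batch) after transpose
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.reshape(1, -1).expand_as(pred))

    results = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        results.append(correct_k.mul_(100.0 / batch))

    return results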
def progressive_adaptive_regularization(
    stage,
    max_stage,
    train_sizes,
    valid_sizes,
    randaug_layers,
    randaug_magnitudes,
    mixups,
    cutmixes,
    dropouts,
    drop_paths,
    verbose=True,
):
    train_size = int(lerp(*train_sizes, stage, max_stage))
    valid_size = int(lerp(*valid_sizes, stage, max_stage))
    randaug_layer = int(lerp(*randaug_layers, stage, max_stage))
    randaug_magnitude = lerp(*randaug_magnitudes, stage, max_stage)
    mixup = lerp(*mixups, stage, max_stage)
    cutmix = lerp(*cutmixes, stage, max_stage)
    dropout = lerp(*dropouts, stage, max_stage)
    drop_path = lerp(*drop_paths, stage, max_stage)

    if verbose:
        logger = get_logger()
        log = f"""Progressive Training with Adaptive Regularization
Stage: {stage + 1} / {max_stage}
Image Size: train={train_size}, valid={valid_size}
RandAugment: n_augment={randaug_layer}, magnitude={randaug_magnitude}
Mixup: {mixup}, Cutmix: {cutmix}, Dropout={dropout}, DropPath={drop_path}"""
        logger.info(log)

    return SimpleNamespace(
        train_size=train_size,
        valid_size=valid_size,
        randaug_layer=randaug_layer,
        randaug_magnitude=randaug_magnitude,
        mixup=mixup,
        cutmix=cutmix,
        dropout=dropout,
        drop_path=drop_path,
    )
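# A minimal sketch of the lerp helper used above; the real implementation is
# not shown. Assumption: plain linear interpolation from `start` at stage 0 to
# `end` at the final stage (max_stage - 1).
def lerp_sketch(start, end, stage, max_stage):
    if max_stage <= 1:
        return end

    ratio = stage / (max_stage - 1)

    return start + (end - start) * ratio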
Example #4
def main(conf):
    device = "cuda"
    conf.distributed = conf.n_gpu > 1
    torch.backends.cudnn.benchmark = True

    logger = get_logger(mode=conf.logger)
    logger.info(conf.dict())

    model = conf.arch.make().to(device)
    model_ema = conf.arch.make().to(device)

    logger.info(model)

    if conf.distributed:
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[dist.get_local_rank()],
            output_device=dist.get_local_rank(),
        )
        model_module = model.module

        accumulate(model_ema, model_module, 0)

    else:
        model_module = model
        accumulate(model_ema, model, 0)

    grad_accum = conf.training.grad_accumulation

    if conf.training.progressive.step > 0:
        progressive_stage = 0
        train_loader, valid_loader, train_sampler, grad_accum = make_progressive_loader(
            progressive_stage, model_module, conf)

    else:
        train_set, valid_set = make_dataset(
            conf.dataset_path,
            conf.training.train_size,
            conf.training.valid_size,
            {
                "n_augment": conf.training.randaug_layer,
                "magnitude": conf.training.randaug_magnitude,
                "increasing": conf.training.randaug_increasing,
                "magnitude_std": conf.training.randaug_magnitude_std,
                "cutout": conf.training.randaug_cutout,
            },
            {
                "mixup": conf.training.mixup,
                "cutmix": conf.training.cutmix,
                "mix_before_aug": conf.training.mix_before_aug,
            },
            conf.training.erasing,
        )

        batch_size = conf.training.dataloader.batch_size // grad_accum

        train_loader, valid_loader, train_sampler = make_dataloader(
            train_set,
            valid_set,
            batch_size,
            conf.distributed,
            conf.training.dataloader.num_workers,
        )

    criterion_train = MixLoss(eps=0.1)
    criterion_valid = nn.CrossEntropyLoss()

    parameters, names = add_weight_decay(
        model.named_parameters(),
        conf.training.weight_decay,
        wd_skip_fn(conf.training.wd_skip),
    )

    optimizer = make_optimizer(conf.training, parameters)
    epoch_len = math.ceil(len(train_loader) / grad_accum)
    scheduler = make_scheduler(conf.training, optimizer, epoch_len)

    step = 0

    scaler = amp.GradScaler(enabled=conf.fp16)

    checker = conf.checker.make()

    for epoch in range(conf.training.epoch):
        if conf.distributed:
            train_sampler.set_epoch(epoch)

        train(
            conf,
            step,
            epoch,
            train_loader,
            model,
            model_ema,
            criterion_train,
            optimizer,
            scheduler,
            scaler,
            grad_accum,
        )
        step += epoch_len

        if conf.training.ema == 0:
            prec1, prec5, losses = valid(conf, valid_loader, model_module,
                                         criterion_valid)

        else:
            prec1, prec5, losses = valid(conf, valid_loader, model_ema,
                                         criterion_valid)

        checker.log(
            step=epoch + 1,
            prec1=prec1,
            prec5=prec5,
            loss=losses.avg,
            lr=optimizer.param_groups[0]["lr"],
        )
        try:
            checker.checkpoint(
                {
                    "model": model_module.state_dict(),
                    "ema": model_ema.state_dict(),
                    "scheduler": scheduler.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "conf": conf.dict(),
                },
                f"epoch-{str(epoch + 1).zfill(3)}.pt",
            )
        except Exception as e:
            print(e)

        if (conf.training.progressive.step > 0
                and (epoch + 1) % conf.training.progressive.step == 0):
            progressive_stage += 1

            if (progressive_stage <
                    conf.training.epoch // conf.training.progressive.step):
                train_loader, valid_loader, train_sampler, grad_accum = make_progressive_loader(
                    progressive_stage, model_module, conf)
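# A minimal sketch of the add_weight_decay / wd_skip_fn pair used in main();
# the real helpers are not shown. Assumption: the usual split into a decayed
# group and a no-decay group (the "bias_and_norm" mode below is hypothetical),
# with the no-decay group marked by a "no_decay" key, matching the
# `if "no_decay" not in param_group` check in the DINO training loop later on.
def wd_skip_fn_sketch(mode):
    def skip(name, param):
        if mode == "bias_and_norm":
            # 1-d tensors cover norm weights/biases as well as plain biases
            return param.ndim <= 1 or name.endswith(".bias")

        return False

    return skip


def add_weight_decay_sketch(named_parameters, weight_decay, skip_fn):
    decay, no_decay = [], []
    decay_names, no_decay_names = [], []

    for name, param in named_parameters:
        if not param.requires_grad:
            continue

        if skip_fn(name, param):
            no_decay.append(param)
            no_decay_names.append(name)

        else:
            decay.append(param)
            decay_names.append(name)

    param_groups = [
        {"params": no_decay, "weight_decay": 0.0, "no_decay": True},
        {"params": decay, "weight_decay": weight_decay},
    ]

    return param_groups, (no_decay_names, decay_names)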
Example #5
def train(
    conf,
    step,
    epoch,
    loader,
    model,
    model_ema,
    criterion,
    optimizer,
    scheduler,
    scaler,
    grad_accum,
):
    device = "cuda"

    batch_time = Meter()
    data_time = Meter()
    losses = Meter()
    top1 = Meter()
    top5 = Meter()

    model.train()

    agc_params = [
        p[1] for p in model.named_parameters() if "linear" not in p[0]
    ]
    params = list(model.parameters())

    logger = get_logger(mode=conf.logger)

    start = perf_counter()
    for i, (input, label1, label2, ratio) in enumerate(loader):
        # measure data loading time
        input = input.to(device)
        label1 = label1.to(device)
        label2 = label2.to(device)
        ratio = ratio.to(device=device, dtype=torch.float32)
        data_time.update(perf_counter() - start)

        with amp.autocast(enabled=conf.fp16):
            out = model(input)
            loss = criterion(out, label1, label2, ratio) / grad_accum

        prec1, prec5 = accuracy(out, label1, topk=(1, 5))
        batch = input.shape[0]
        losses.update(loss.item() * grad_accum, batch)
        top1.update(prec1.item(), batch)
        top5.update(prec5.item(), batch)

        scaler.scale(loss).backward()

        if ((i + 1) % grad_accum == 0) or (i + 1) == len(loader):
            if conf.training.agc > 0 or conf.training.clip_grad_norm > 0:
                if conf.fp16:
                    scaler.unscale_(optimizer)

                if conf.training.agc > 0:
                    adaptive_grad_clip(agc_params, conf.training.agc)

                if conf.training.clip_grad_norm > 0:
                    nn.utils.clip_grad_norm_(params,
                                             conf.training.clip_grad_norm)

            scheduler.step()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

        t = step + i

        if conf.training.ema > 0:
            if conf.distributed:
                model_module = model.module

            else:
                model_module = model

            accumulate(
                model_ema,
                model_module,
                min(conf.training.ema, (1 + t) / (10 + t)),
                ema_bn=conf.training.ema_bn,
            )

        batch_time.update(perf_counter() - start)
        start = perf_counter()

        if dist.is_primary() and i % conf.log_freq == 0:
            lr = optimizer.param_groups[0]["lr"]

            logger.info(
                f"epoch: {epoch} ({i}/{len(loader)}); time: {batch_time.val:.3f} ({batch_time.avg:.2f}); "
                f"data: {data_time.val:.3f} ({data_time.avg:.2f}); "
                f"loss: {losses.val:.3f} ({losses.avg:.3f}); "
                f"prec@1: {top1.val:.2f} ({top1.avg:.2f}); "
                f"prec@5: {top5.val:.2f} ({top5.avg:.2f})")

    return losses
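# A minimal sketch of the accumulate() EMA helper used by train() and main();
# the real version is not shown. Assumption: a standard exponential moving
# average over parameters, with ema_bn optionally copying buffers (e.g.
# BatchNorm running statistics) straight from the live model. Calling it with
# decay=0, as main() does at startup, makes the EMA an exact copy.
import torch


@torch.no_grad()
def accumulate_sketch(model_ema, model, decay=0.999, ema_bn=False):
    ema_params = dict(model_ema.named_parameters())
    params = dict(model.named_parameters())

    for name in ema_params:
        ema_params[name].mul_(decay).add_(params[name], alpha=1 - decay)

    if ema_bn:
        ema_buffers = dict(model_ema.named_buffers())
        buffers = dict(model.named_buffers())

        for name in ema_buffers:
            ema_buffers[name].copy_(buffers[name])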
Example #6
    def __init__(self, formatter=None):
        if formatter is None:
            formatter = default_formatter

        self.logger = get_logger()
        self.formatter = formatter
def make_dataset(
    path, train_size, valid_size, randaug_params, mix_params, erasing, verbose=True
):
    train_dir = os.path.join(nsml.DATASET_PATH, path, "train.lmdb")
    valid_dir = os.path.join(nsml.DATASET_PATH, path, "valid.lmdb")

    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
    )

    transform_list = [
        transforms.RandomResizedCrop(train_size, interpolation=Image.BICUBIC),
        transforms.RandomHorizontalFlip(),
        RandAugment(**randaug_params),
        transforms.ToTensor(),
        normalize,
    ]

    if erasing > 0:
        transform_list += [
            RandomErasing(
                erasing, mode="pixel", max_count=1, num_splits=0, device="cpu"
            )
        ]

    if mix_params["mix_before_aug"]:
        preprocess = transform_list[:2]
        postprocess = transform_list[2:]

    else:
        preprocess = transform_list
        postprocess = []

    if verbose:
        logger = get_logger()

        log = f"""Transforms
Transform before Mixes:
{preprocess}
Mixes: mixup={mix_params["mixup"]}, cutmix={mix_params["cutmix"]}"""

        if mix_params["mix_before_aug"]:
            log += f"""
Transform after Mixes:
{postprocess}"""

        logger.info(log)

    train_preprocess = transforms.Compose(preprocess)
    train_postprocess = transforms.Compose(postprocess)

    train_set = LMDBDataset(train_dir, train_preprocess)
    train_set = MixDataset(
        train_set, train_postprocess, mix_params["mixup"], mix_params["cutmix"]
    )

    valid_preprocess = transforms.Compose(
        [
            transforms.Resize(valid_size + 32, interpolation=Image.BICUBIC),
            transforms.CenterCrop(valid_size),
            transforms.ToTensor(),
            normalize,
        ]
    )

    valid_set = LMDBDataset(valid_dir, valid_preprocess)

    return train_set, valid_set
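# A minimal sketch of the MixDataset wrapper built above; the real class is
# not shown. It has to yield the (input, label1, label2, ratio) tuples that
# the training loop in Example #5 unpacks. This hypothetical version only
# implements mixup (cutmix omitted) and assumes tensor images, i.e. the
# mix_before_aug=False path where the full transform has already run.
import random

import torch
from torch.utils.data import Dataset


class MixDatasetSketch(Dataset):
    def __init__(self, dataset, postprocess, mixup, cutmix):
        self.dataset = dataset
        self.postprocess = postprocess
        self.mixup = mixup
        self.cutmix = cutmix  # unused in this sketch

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        img1, label1 = self.dataset[index]

        if self.mixup > 0:
            img2, label2 = self.dataset[random.randrange(len(self.dataset))]
            # Beta(alpha, alpha) mixing ratio, as in the mixup paper
            ratio = float(torch.distributions.Beta(self.mixup, self.mixup).sample())
            img = ratio * img1 + (1 - ratio) * img2

        else:
            img, label2, ratio = img1, label1, 1.0

        if self.postprocess is not None:
            img = self.postprocess(img)

        return img, label1, label2, ratio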
def main(conf):
    device = "cuda"
    conf.distributed = conf.n_gpu > 1
    torch.backends.cudnn.benchmark = True

    logger = get_logger(mode=conf.logger)
    logger.info(conf.dict())

    student = conf.arch.make().to(device)
    student.set_drop_path(conf.task.student_drop_path)
    teacher = conf.arch.make().to(device)

    logger.info(student)

    if conf.distributed:
        teacher = nn.parallel.DistributedDataParallel(
            teacher,
            device_ids=[dist.get_local_rank()],
            output_device=dist.get_local_rank(),
        )
        student = nn.parallel.DistributedDataParallel(
            student,
            device_ids=[dist.get_local_rank()],
            output_device=dist.get_local_rank(),
        )
        teacher_module = teacher.module
        student_module = student.module

        teacher_module.load_state_dict(student_module.state_dict())

    else:
        teacher_module = teacher
        student_module = student

        teacher_module.load_state_dict(student.state_dict())

    for p in teacher.parameters():
        p.requires_grad = False

    grad_accum = conf.training.grad_accumulation

    train_set, valid_set = make_augment_dataset(
        conf.dataset_path,
        DINOAugment(
            conf.task.global_crop_size,
            conf.task.local_crop_size,
            conf.task.global_crop_scale,
            conf.task.local_crop_scale,
            conf.task.n_local_crop,
        ),
        None,
    )

    batch_size = conf.training.dataloader.batch_size // grad_accum

    train_loader, valid_loader, train_sampler = make_dataloader(
        train_set,
        valid_set,
        batch_size,
        conf.distributed,
        conf.training.dataloader.num_workers,
    )

    criterion_train = DINOLoss(
        conf.arch.dim_head_out,
        conf.task.n_local_crop + 2,
        conf.task.warmup_teacher_temperature,
        conf.task.teacher_temperature,
        conf.task.warmup_teacher_temperature_epoch,
        conf.training.epoch,
    ).to(device)

    parameters, names = add_weight_decay(
        student.named_parameters(),
        conf.training.weight_decay,
        wd_skip_fn(conf.training.wd_skip),
    )

    def make_scheduler(train_conf, optimizer, epoch_len):
        warmup = train_conf.scheduler.warmup * epoch_len
        n_iter = epoch_len * train_conf.epoch
        lr = train_conf.base_lr * train_conf.dataloader.batch_size / 256

        if train_conf.scheduler.type == "exp_epoch":
            return train_conf.scheduler.make(optimizer,
                                             epoch_len,
                                             lr=lr,
                                             max_iter=train_conf.epoch,
                                             warmup=warmup)

        else:
            return train_conf.scheduler.make(optimizer,
                                             lr=lr,
                                             n_iter=n_iter,
                                             warmup=warmup)

    optimizer = make_optimizer(conf.training, parameters)
    epoch_len = math.ceil(len(train_loader) / grad_accum)
    scheduler = make_scheduler(conf.training, optimizer, epoch_len)
    wd_schedule = cosine_schedule(
        conf.training.weight_decay,
        conf.task.weight_decay_end,
        epoch_len * conf.training.epoch,
    )
    momentum_schedule = cosine_schedule(conf.task.teacher_momentum, 1,
                                        epoch_len * conf.training.epoch)

    scaler = amp.GradScaler(enabled=conf.fp16)

    checker = conf.checker.make()

    step = 0

    for epoch in range(conf.training.epoch):
        if conf.distributed:
            train_sampler.set_epoch(epoch)

        train(
            conf,
            step,
            epoch,
            train_loader,
            teacher,
            student,
            criterion_train,
            optimizer,
            scheduler,
            wd_schedule,
            momentum_schedule,
            scaler,
            grad_accum,
            checker,
        )
        step += epoch_len

        try:
            checker.checkpoint(
                {
                    "student": student_module.state_dict(),
                    "teacher": teacher_module.state_dict(),
                    "scheduler": scheduler.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "conf": conf.dict(),
                },
                f"epoch-{str(epoch + 1).zfill(3)}.pt",
            )

        except Exception as e:
            print(e)
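# A minimal sketch of the cosine_schedule helper used above to precompute the
# per-step weight-decay and teacher-momentum schedules; the real implementation
# is not shown. Assumption: a list that moves from `start` to `end` along a
# half cosine over n_iter steps and is indexed by the global step.
import math


def cosine_schedule_sketch(start, end, n_iter):
    schedule = []

    for t in range(n_iter):
        cos = 0.5 * (1 + math.cos(math.pi * t / max(n_iter - 1, 1)))
        schedule.append(end + (start - end) * cos)

    return schedule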
def train(
    conf,
    step,
    epoch,
    loader,
    teacher,
    student,
    criterion,
    optimizer,
    scheduler,
    wd_schedule,
    momentum_schedule,
    scaler,
    grad_accum,
    checker,
):
    device = "cuda"

    batch_time = Meter()
    data_time = Meter()
    losses = Meter()

    student.train()

    agc_params = [
        p[1] for p in student.named_parameters() if "linear" not in p[0]
    ]
    params = list(student.parameters())

    logger = get_logger(mode=conf.logger)

    start = perf_counter()
    for i, (inputs, _) in enumerate(loader):
        # measure data loading time
        inputs = [inp.to(device) for inp in inputs]
        data_time.update(perf_counter() - start)

        with amp.autocast(enabled=conf.fp16):
            with torch.no_grad():
                teacher_out = teacher(inputs[:2])

            student_out = student(inputs)

            loss = criterion(student_out, teacher_out, epoch) / grad_accum

        losses.update(loss.item() * grad_accum, inputs[0].shape[0])

        scaler.scale(loss).backward()

        for param_group in optimizer.param_groups:
            if "no_decay" not in param_group:
                param_group["weight_decay"] = wd_schedule[step]

        if ((i + 1) % grad_accum == 0) or (i + 1) == len(loader):
            if conf.training.agc > 0 or conf.training.clip_grad_norm > 0:
                if conf.fp16:
                    scaler.unscale_(optimizer)

                if conf.training.agc > 0:
                    adaptive_grad_clip(agc_params, conf.training.agc)

                if conf.training.clip_grad_norm > 0:
                    nn.utils.clip_grad_norm_(params,
                                             conf.training.clip_grad_norm)

            cancel_last_layer_grad(epoch, student, conf.task.freeze_last_layer)

            scheduler.step()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

            with torch.no_grad():
                m = momentum_schedule[step]

                for param_q, param_k in zip(student.parameters(),
                                            teacher.parameters()):
                    param_k.detach().mul_(m).add_(param_q.detach(),
                                                  alpha=1 - m)

        batch_time.update(perf_counter() - start)
        start = perf_counter()

        if dist.is_primary() and i % conf.log_freq == 0:
            lr = optimizer.param_groups[0]["lr"]

            checker.log(
                step=step,
                weight_decay=wd_schedule[step],
                momentum=momentum_schedule[step],
                loss=losses.avg,
                lr=lr,
            )

        step += 1

    return losses
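# A minimal sketch of cancel_last_layer_grad, following the DINO recipe of
# freezing the last layer of the projection head for the first
# freeze_last_layer epochs; the helper body and the "last_layer" parameter
# naming are assumptions here, not the actual implementation.
def cancel_last_layer_grad_sketch(epoch, model, freeze_last_layer):
    if epoch >= freeze_last_layer:
        return

    for name, param in model.named_parameters():
        if "last_layer" in name:
            param.grad = None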