Example #1
def main(args):

    # Lets cuDNN benchmark conv implementations and choose the fastest.
    # Only good if sizes stay the same within the main loop!
    torch.backends.cudnn.benchmark = True

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #classes = 5

    #valid_set, valid_loader = mkval(args)

    model = models.KNOWN_MODELS[args.model](head_size=args.classes,
                                            zero_head=False)
    model = torch.nn.DataParallel(model)
    # Load the fine-tuned checkpoint to CPU first; the model is moved to the
    # GPU below, which saves a little bit of GPU memory while loading.
    checkpoint = torch.load(args.weight_path, map_location="cpu")
    model.load_state_dict(checkpoint["model"])

    model = model.to(device)

    model.eval()

    val_tx = tv.transforms.Compose([
        tv.transforms.Resize((448, 448)),
        tv.transforms.ToTensor(),
        tv.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

    run_predict(model, args.datadir, val_tx, device)
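
This example shows only the main function; the command-line arguments it reads (model, classes, weight_path, datadir) are parsed elsewhere. A minimal, hypothetical parser covering just the attributes this main accesses might look like the sketch below; the flag names, defaults, and help strings are assumptions, not taken from the original script.

import argparse

def parse_args():
    # Hypothetical parser: only the attributes used by main() above are defined.
    parser = argparse.ArgumentParser(
        description="Run inference with a fine-tuned classifier")
    parser.add_argument("--model", required=True,
                        help="Key into models.KNOWN_MODELS (e.g. a BiT variant)")
    parser.add_argument("--classes", type=int, default=5,
                        help="Size of the classification head")
    parser.add_argument("--weight_path", required=True,
                        help="Fine-tuned checkpoint to load")
    parser.add_argument("--datadir", required=True,
                        help="Directory of images passed to run_predict")
    return parser.parse_args()

if __name__ == "__main__":
    main(parse_args())
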
Example #2
def main(args):

    best_acc = -1

    logger = bit_common.setup_logger(args)
    cp, cn = smooth_BCE(eps=0.1)  # label-smoothing targets (unused below)
    # Lets cuDNN benchmark conv implementations and choose the fastest.
    # Only good if sizes stay the same within the main loop!
    torch.backends.cudnn.benchmark = True

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Going to train on {device}")

    classes = 5

    train_set, valid_set, train_loader, valid_loader = mktrainval(args, logger)
    logger.info(f"Loading model from {args.model}.npz")
    #model = models.KNOWN_MODELS[args.model](head_size=classes, zero_head=True)
    #model.load_from(np.load(f"{args.model}.npz"))

    model = EfficientNet.from_pretrained(args.model, num_classes=classes)
    logger.info("Moving model onto all GPUs")
    model = torch.nn.DataParallel(model)

    # Optionally resume from a checkpoint.
    # Load it to CPU first as we'll move the model to GPU later.
    # This way, we save a little bit of GPU memory when loading.
    start_epoch = 0

    # Note: no weight-decay!
    optim = torch.optim.SGD(model.parameters(), lr=0.003, momentum=0.9)

    # Resume fine-tuning if we find a saved model.
    savename = pjoin(args.logdir, args.name, "bit.pth.tar")
    try:
        logger.info(f"Model will be saved in '{savename}'")
        checkpoint = torch.load(savename, map_location="cpu")
        logger.info(f"Found saved model to resume from at '{savename}'")

        start_epoch = checkpoint["epoch"]
        model.load_state_dict(checkpoint["model"])
        optim.load_state_dict(checkpoint["optim"])
        logger.info(f"Resumed at epoch {start_epoch}")
    except FileNotFoundError:
        logger.info("Fine-tuning from BiT")

    model = model.to(device)
    optim.zero_grad()

    model.train()
    mixup = bit_hyperrule.get_mixup(len(train_set))
    #mixup = -1
    cri = torch.nn.CrossEntropyLoss().to(device)
    #cri = FocalLoss(cri)
    logger.info("Starting training!")
    chrono = lb.Chrono()
    accum_steps = 0
    mixup_l = np.random.beta(mixup, mixup) if mixup > 0 else 1
    end = time.time()

    num_epochs = 10
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optim,
                                                    max_lr=0.01,
                                                    steps_per_epoch=1,
                                                    epochs=num_epochs)

    with lb.Uninterrupt() as u:
        for epoch in range(start_epoch, num_epochs):

            pbar = enumerate(train_loader)
            pbar = tqdm.tqdm(pbar, total=len(train_loader))

            all_top1, all_top5 = [], []
            for param_group in optim.param_groups:
                lr = param_group["lr"]
            #for x, y in recycle(train_loader):
            for batch_id, (x, y) in pbar:
                #for batch_id, (x, y) in enumerate(train_loader):
                # measure data loading time, which is spent in the `for` statement.
                chrono._done("load", time.time() - end)

                if u.interrupted:
                    break

                # Schedule sending to GPU(s)
                x = x.to(device, non_blocking=True)
                y = y.to(device, non_blocking=True)

                # The learning rate is driven by the OneCycleLR schedule,
                # which is stepped once per epoch below.

                if mixup > 0.0:
                    x, y_a, y_b = mixup_data(x, y, mixup_l)

                # compute output
                with chrono.measure("fprop"):
                    logits = model(x)
                    top1, top5 = topk(logits, y, ks=(1, 5))
                    all_top1.extend(top1.cpu())
                    all_top5.extend(top5.cpu())
                    if mixup > 0.0:
                        c = mixup_criterion(cri, logits, y_a, y_b, mixup_l)
                    else:
                        c = cri(logits, y)
                train_loss = c.item()
                train_acc = np.mean(all_top1) * 100.0
                # Accumulate grads
                with chrono.measure("grads"):
                    (c / args.batch_split).backward()
                    accum_steps += 1
                accstep = f"({accum_steps}/{args.batch_split})" if args.batch_split > 1 else ""
                s = f"epoch={epoch} batch {batch_id}{accstep}: loss={train_loss:.5f} train_acc={train_acc:.2f} lr={lr:.1e}"
                #s = f"epoch={epoch} batch {batch_id}{accstep}: loss={c.item():.5f} lr={lr:.1e}"
                pbar.set_description(s)
                #logger.info(f"[batch {batch_id}{accstep}]: loss={c_num:.5f} (lr={lr:.1e})")  # pylint: disable=logging-format-interpolation
                logger.flush()

                # Update params once gradients from `batch_split` micro-batches
                # have been accumulated, matching the loss scaling above.
                if accum_steps == args.batch_split:
                    with chrono.measure("update"):
                        optim.step()
                        optim.zero_grad()
                    accum_steps = 0
                    # Sample new mixup ratio for next batch
                    mixup_l = np.random.beta(mixup, mixup) if mixup > 0 else 1

                # Reset the data-loading timer so `chrono` only measures time
                # spent in the `for` statement.
                end = time.time()

            # Advance the one-cycle LR schedule once per epoch, after this
            # epoch's optimizer updates.
            scheduler.step()

            # Run evaluation and save the model.
            val_loss, val_acc = run_eval(model, valid_loader, device, chrono,
                                         logger, epoch)

            best = val_acc > best_acc
            if best:
                best_acc = val_acc
                torch.save(
                    {
                        "epoch": epoch,
                        "val_loss": val_loss,
                        "val_acc": val_acc,
                        "train_acc": train_acc,
                        "model": model.state_dict(),
                        "optim": optim.state_dict(),
                    }, savename)
            end = time.time()

    logger.info(f"Timings:\n{chrono}")
Example #3
def main(args):
    logger = common.setup_logger(args)

    # Lets cuDNN benchmark conv implementations and choose the fastest.
    # Only good if sizes stay the same within the main loop!
    torch.backends.cudnn.benchmark = True

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logger.info(f"Going to train on {device}")

    train_set, valid_set, train_loader, valid_loader = mktrainval(args, logger)

    logger.info(f"Loading model from {args.model}.npz")
    model = models.KNOWN_MODELS[args.model](head_size=len(valid_set.classes),
                                            zero_head=True)
    model.load_from(
        np.load(os.path.join(args.pretrained_dir, f"{args.model}.npz")))

    logger.info("Moving model onto all GPUs")
    model = torch.nn.DataParallel(model)

    # Optionally resume from a checkpoint.
    # Load it to CPU first as we'll move the model to GPU later.
    # This way, we save a little bit of GPU memory when loading.
    step = 0

    # Note: no weight-decay!
    optim = torch.optim.SGD(model.parameters(), lr=args.base_lr, momentum=0.9)

    writer = SummaryWriter(os.path.join(args.logdir, args.name))

    # Resume fine-tuning if we find a saved model.
    savename = pjoin(args.logdir, args.name, "model.tar")
    try:
        logger.info(f"Model will be saved in '{savename}'")
        checkpoint = torch.load(savename, map_location="cpu")
        logger.info(f"Found saved model to resume from at '{savename}'")

        step = checkpoint["step"]
        model.load_state_dict(checkpoint["model"])
        optim.load_state_dict(checkpoint["optim"])
        logger.info(f"Resumed at step {step}")
    except FileNotFoundError:
        logger.info("Fine-tuning from BiT")

    model = model.to(device)
    optim.zero_grad()

    model.train()
    mixup = hyperrule.get_mixup(len(train_set))
    cri = torch.nn.CrossEntropyLoss().to(device)

    logger.info("Starting training!")
    chrono = lb.Chrono()
    accum_steps = 0
    mixup_l = np.random.beta(mixup, mixup) if mixup > 0 else 1
    end = time.time()

    with lb.Uninterrupt() as u:
        for x, y in recycle(train_loader):
            # measure data loading time, which is spent in the `for` statement.
            chrono._done("load", time.time() - end)

            if u.interrupted:
                break

            # Schedule sending to GPU(s)
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

            # Update learning-rate, including stop training if over.
            lr = hyperrule.get_lr(step, len(train_set), args.base_lr)
            if lr is None:
                break
            for param_group in optim.param_groups:
                param_group["lr"] = lr

            if mixup > 0.0:
                x, y_a, y_b = mixup_data(x, y, mixup_l)

            # compute output
            with chrono.measure("fprop"):
                logits = model(x)
                if mixup > 0.0:
                    c = mixup_criterion(cri, logits, y_a, y_b, mixup_l)
                else:
                    c = cri(logits, y)
                c_num = float(
                    c.data.cpu().numpy())  # Also ensures a sync point.

            # Accumulate grads
            with chrono.measure("grads"):
                (c / args.batch_split).backward()
                accum_steps += 1

            accstep = f" ({accum_steps}/{args.batch_split})" if args.batch_split > 1 else ""
            logger.info(
                f"[step {step}{accstep}]: loss={c_num:.5f} (lr={lr:.1e})")  # pylint: disable=logging-format-interpolation
            logger.flush()
            writer.add_scalar('Train/loss', c_num, step)
            writer.add_scalar('Train/lr', lr, step)

            # Update params
            if accum_steps == args.batch_split:
                with chrono.measure("update"):
                    optim.step()
                    optim.zero_grad()
                step += 1
                accum_steps = 0
                # Sample new mixup ratio for next batch
                mixup_l = np.random.beta(mixup, mixup) if mixup > 0 else 1

                # Run evaluation and save the model.
                if args.eval_every and step % args.eval_every == 0:
                    run_eval(model, valid_loader, device, chrono, logger,
                             writer, step)
                if args.save and step % args.save_every == 0:
                    step_savename = pjoin(args.logdir, args.name,
                                          "model_" + str(step) + ".tar")
                    torch.save(
                        {
                            "step": step,
                            "model": model.state_dict(),
                            "optim": optim.state_dict()
                        }, step_savename)

            end = time.time()

        # Final eval at end of training.
        run_eval(model, valid_loader, device, chrono, logger, writer, step)

    logger.info(f"Timings:\n{chrono}")