Example #1
def main(args):
    global step, epoch, result_dict

    if args.checkpoint is not None:
        print("#############  Get Old Args   ##############")
        model, epoch, step, old_args = Checkpoint.restore(args.checkpoint)

        args.model = old_args.model
        args.model_input = old_args.model_input
        args.viewpoint = old_args.viewpoint
        args.rot_rep = old_args.rot_rep
        args.feat_dim = old_args.feat_dim
        args.voxel_dim = old_args.voxel_dim
        args.small_decoder = old_args.small_decoder
        args.reconstruction = old_args.reconstruction
        args.unet_output = old_args.unet_output
        args.depth_sculpt = old_args.depth_sculpt
        args.best_loss = old_args.best_loss
    else:
        model = get_model(args.model, args.rot_rep, args.model_input,
                          args.unet_output, args.pretrained,
                          args.no_refinement)
    print(args)

    loader = get_loaders(
        name=args.dataset,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        split=args.split,
        rot_rep=args.rot_rep,
        n_views=args.n_views,
        corrupt_vp=args.corrupt_vp,
    )

    loader.dataset[0]  # fetch a single sample up front (the result is discarded)
    # initialize result dictionaries
    logable_metrics, printable_metrics = get_metrics(
        args.reconstruction,
        args.viewpoint,
        args.realism_check,
        args.unet_output,
    )
    result_dict = ResultDict(loader.dataset, logable_metrics,
                             printable_metrics)

    # Train on GPU
    model.cuda()

    print("#############  Start Evaluation   ##############")
    eval_step(model, loader, args.split)
Example #2
def main(eval_args):
    # ensures that weight initializations are all the same
    logging = utils.Logger(eval_args.local_rank, eval_args.save)

    # load a checkpoint
    logging.info('loading the model at:')
    logging.info(eval_args.checkpoint)
    checkpoint = torch.load(eval_args.checkpoint, map_location='cpu')
    args = checkpoint['args']

    if not hasattr(args, 'ada_groups'):
        logging.info('old model, no ada groups was found.')
        args.ada_groups = False

    if not hasattr(args, 'min_groups_per_scale'):
        logging.info('old model, no min_groups_per_scale was found.')
        args.min_groups_per_scale = 1

    if not hasattr(args, 'num_mixture_dec'):
        logging.info('old model, no num_mixture_dec was found.')
        args.num_mixture_dec = 10

    logging.info('loaded the model at epoch %d', checkpoint['epoch'])
    arch_instance = utils.get_arch_cells(args.arch_instance)
    model = AutoEncoder(args, None, arch_instance)
    # Loading is not strict because of self.weight_normalized in Conv2D class in neural_operations. This variable
    # is only used for computing the spectral normalization and it is safe not to load it. Some of our earlier models
    # did not have this variable.
    model.load_state_dict(checkpoint['state_dict'], strict=False)
    model = model.cuda()

    logging.info('args = %s', args)
    logging.info('num conv layers: %d', len(model.all_conv_layers))
    logging.info('param size = %fM ', utils.count_parameters_in_M(model))

    if eval_args.eval_mode == 'evaluate':
        # load train valid queue
        args.data = eval_args.data
        train_queue, valid_queue, num_classes = datasets.get_loaders(args)

        if eval_args.eval_on_train:
            logging.info('Using the training data for eval.')
            valid_queue = train_queue

        # get number of bits
        num_output = utils.num_output(args.dataset)
        bpd_coeff = 1. / np.log(2.) / num_output

        valid_neg_log_p, valid_nelbo = test(
            valid_queue,
            model,
            num_samples=eval_args.num_iw_samples,
            args=args,
            logging=logging)
        logging.info('final valid nelbo %f', valid_nelbo)
        logging.info('final valid neg log p %f', valid_neg_log_p)
        logging.info('final valid nelbo in bpd %f', valid_nelbo * bpd_coeff)
        logging.info('final valid neg log p in bpd %f',
                     valid_neg_log_p * bpd_coeff)

    else:
        bn_eval_mode = not eval_args.readjust_bn
        num_samples = 16
        with torch.no_grad():
            n = int(np.floor(np.sqrt(num_samples)))
            set_bn(model,
                   bn_eval_mode,
                   num_samples=36,
                   t=eval_args.temp,
                   iter=500)
            for ind in range(10):  # sampling is repeated.
                torch.cuda.synchronize()
                start = time()
                with autocast():
                    logits = model.sample(num_samples, eval_args.temp)
                output = model.decoder_output(logits)
                output_img = output.mean if isinstance(output, torch.distributions.bernoulli.Bernoulli) \
                    else output.sample()
                torch.cuda.synchronize()
                end = time()

                output_tiled = utils.tile_image(output_img,
                                                n).cpu().numpy().transpose(
                                                    1, 2, 0)
                logging.info('sampling time per batch: %0.3f sec',
                             (end - start))
                output_tiled = np.asarray(output_tiled * 255, dtype=np.uint8)
                output_tiled = np.squeeze(output_tiled)

                plt.imshow(output_tiled)
                plt.show()
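
The bpd_coeff in the evaluation branch converts a negative log-likelihood from nats to bits per dimension: divide by ln 2 and by the number of output dimensions returned by utils.num_output. A minimal standalone sketch of that conversion (the numbers in the usage line are hypothetical):

import numpy as np

def nats_to_bpd(nll_nats, num_output_dims):
    # bits per dimension = nats / (ln 2 * number of output dimensions)
    return nll_nats / (np.log(2.) * num_output_dims)

# e.g. a NELBO of 7000 nats on 3x32x32 images comes out to roughly 3.29 bpd
print(nats_to_bpd(7000.0, 3 * 32 * 32))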
Example #3
def main(args):
    # ensures that weight initializations are all the same
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    logging = utils.Logger(args.global_rank, args.save)
    writer = utils.Writer(args.global_rank, args.save)

    # Get data loaders.
    train_queue, valid_queue, num_classes, _ = datasets.get_loaders(args)
    args.num_total_iter = len(train_queue) * args.epochs
    warmup_iters = len(train_queue) * args.warmup_epochs
    swa_start = len(train_queue) * (args.epochs - 1)

    arch_instance = utils.get_arch_cells(args.arch_instance)

    model = AutoEncoder(args, writer, arch_instance)
    model = model.cuda()

    logging.info('args = %s', args)
    logging.info('param size = %fM ', utils.count_parameters_in_M(model))
    logging.info('groups per scale: %s, total_groups: %d',
                 model.groups_per_scale, sum(model.groups_per_scale))

    if args.fast_adamax:
        # Fast adamax has the same functionality as torch.optim.Adamax, except it is faster.
        cnn_optimizer = Adamax(model.parameters(),
                               args.learning_rate,
                               weight_decay=args.weight_decay,
                               eps=1e-3)
    else:
        cnn_optimizer = torch.optim.Adamax(model.parameters(),
                                           args.learning_rate,
                                           weight_decay=args.weight_decay,
                                           eps=1e-3)

    cnn_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        cnn_optimizer,
        float(args.epochs - args.warmup_epochs - 1),
        eta_min=args.learning_rate_min)
    grad_scalar = GradScaler(2**10)

    num_output = utils.num_output(args.dataset, args)
    bpd_coeff = 1. / np.log(2.) / num_output

    # if load
    checkpoint_file = os.path.join(args.save, 'checkpoint.pt')
    if args.cont_training:
        logging.info('loading the model.')
        checkpoint = torch.load(checkpoint_file, map_location='cpu')
        init_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        model = model.cuda()
        cnn_optimizer.load_state_dict(checkpoint['optimizer'])
        grad_scalar.load_state_dict(checkpoint['grad_scalar'])
        cnn_scheduler.load_state_dict(checkpoint['scheduler'])
        global_step = checkpoint['global_step']
    else:
        global_step, init_epoch = 0, 0

    for epoch in range(init_epoch, args.epochs):
        # update lrs.
        if args.distributed:
            train_queue.sampler.set_epoch(global_step + args.seed)
            valid_queue.sampler.set_epoch(0)

        if epoch > args.warmup_epochs:
            cnn_scheduler.step()

        # Logging.
        logging.info('epoch %d', epoch)

        # Training.
        train_nelbo, global_step = train(train_queue, model, cnn_optimizer,
                                         grad_scalar, global_step,
                                         warmup_iters, writer, logging)
        logging.info('train_nelbo %f', train_nelbo)
        writer.add_scalar('train/nelbo', train_nelbo, global_step)

        model.eval()
        # generate samples less frequently
        eval_freq = 1 if args.epochs <= 50 else 20
        if epoch % eval_freq == 0 or epoch == (args.epochs - 1):
            with torch.no_grad():
                num_samples = 16
                n = int(np.floor(np.sqrt(num_samples)))
                for t in [0.7, 0.8, 0.9, 1.0]:
                    logits = model.sample(num_samples, t)
                    output = model.decoder_output(logits)
                    output_img = output.mean if isinstance(
                        output, torch.distributions.bernoulli.Bernoulli
                    ) else output.sample(t)
                    output_tiled = utils.tile_image(output_img, n)
                    writer.add_image('generated_%0.1f' % t, output_tiled,
                                     global_step)

            valid_neg_log_p, valid_nelbo = test(valid_queue,
                                                model,
                                                num_samples=10,
                                                args=args,
                                                logging=logging)
            logging.info('valid_nelbo %f', valid_nelbo)
            logging.info('valid neg log p %f', valid_neg_log_p)
            logging.info('valid bpd elbo %f', valid_nelbo * bpd_coeff)
            logging.info('valid bpd log p %f', valid_neg_log_p * bpd_coeff)
            writer.add_scalar('val/neg_log_p', valid_neg_log_p, epoch)
            writer.add_scalar('val/nelbo', valid_nelbo, epoch)
            writer.add_scalar('val/bpd_log_p', valid_neg_log_p * bpd_coeff,
                              epoch)
            writer.add_scalar('val/bpd_elbo', valid_nelbo * bpd_coeff, epoch)

        save_freq = int(np.ceil(args.epochs / 100))
        if epoch % save_freq == 0 or epoch == (args.epochs - 1):
            if args.global_rank == 0:
                logging.info('saving the model.')
                torch.save(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': cnn_optimizer.state_dict(),
                        'global_step': global_step,
                        'args': args,
                        'arch_instance': arch_instance,
                        'scheduler': cnn_scheduler.state_dict(),
                        'grad_scalar': grad_scalar.state_dict()
                    }, checkpoint_file)

    # Final validation
    valid_neg_log_p, valid_nelbo = test(valid_queue,
                                        model,
                                        num_samples=1000,
                                        args=args,
                                        logging=logging)
    logging.info('final valid nelbo %f', valid_nelbo)
    logging.info('final valid neg log p %f', valid_neg_log_p)
    writer.add_scalar('val/neg_log_p', valid_neg_log_p, epoch + 1)
    writer.add_scalar('val/nelbo', valid_nelbo, epoch + 1)
    writer.add_scalar('val/bpd_log_p', valid_neg_log_p * bpd_coeff, epoch + 1)
    writer.add_scalar('val/bpd_elbo', valid_nelbo * bpd_coeff, epoch + 1)
    writer.close()
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", type=str, default="train")
    parser.add_argument("--model", type=str, default="mobilenet_v2")
    parser.add_argument("--dataset", type=str, default="cifar10")
    parser.add_argument("--dataroot", type=str, default="/tmp/data")
    parser.add_argument("--batch_size", type=int, default=128)
    parser.add_argument("--n_epochs", type=int, default=100)
    parser.add_argument("--lr", type=float, default=1e-3)
    parser.add_argument("--n_gpus", type=int, default=1)
    parser.add_argument("--checkpoint", type=str, default="/tmp/chkpt.pth.tar")
    parser.add_argument("--save_every", type=int, default=10)
    parser.add_argument("--pretrained", type=str, default=None)
    args = parser.parse_args()
    print(args)

    if torch.cuda.is_available():
        print("cuda is available, use cuda")
        device = torch.device("cuda")
    else:
        print("cuda is not available, use cpu")
        device = torch.device("cpu")

    print("download dataset: {}".format(args.dataset))
    train_loader, test_loader, n_classes = get_loaders(
        dataset=args.dataset, root=args.dataroot, batch_size=args.batch_size)

    print("build model: {}".format(args.model))
    if args.model == "mobilenet":
        from models import MobileNet
        model = MobileNet(n_classes=n_classes)
    elif args.model == "mobilenet_v2":
        from models import MobileNet_v2
        model = MobileNet_v2(n_classes=n_classes)
    elif args.model == "shufflenet":
        from models import ShuffleNet
        model = ShuffleNet(n_classes=n_classes)
    elif args.model == "shufflenet_v2":
        from models import ShuffleNet_v2
        model = ShuffleNet_v2(n_classes=n_classes)
    elif args.model == "squeezenet":
        from models import SqueezeNet
        model = SqueezeNet(n_classes=n_classes)
    else:
        raise NotImplementedError

    model = model.to(device)
    if args.pretrained:
        model.load_state_dict(torch.load(args.pretrained, map_location=device))

    if args.n_gpus > 1:
        model = nn.DataParallel(model, device_ids=list(range(args.n_gpus)))

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss()

    if args.mode == "train":
        for epoch in range(args.n_epochs):
            train(epoch, model, optimizer, criterion, train_loader, device)
            if (epoch + 1) % args.save_every == 0:
                print("saving model...")
                torch.save(model.state_dict(), args.checkpoint)
    elif args.mode == "test":
        test(model, criterion, test_loader, device)
    else:
        raise NotImplementedError
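
Example #4 assumes a get_loaders(dataset, root, batch_size) helper that returns (train_loader, test_loader, n_classes). A minimal sketch of what such a helper could look like for CIFAR-10 with torchvision; the transforms and normalization constants here are illustrative assumptions, not the project's actual implementation:

from torch.utils.data import DataLoader
from torchvision import datasets, transforms

def get_loaders(dataset, root, batch_size, num_workers=4):
    # Sketch: only CIFAR-10 is handled here.
    if dataset != "cifar10":
        raise NotImplementedError(dataset)
    tf = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2470, 0.2435, 0.2616)),
    ])
    train_set = datasets.CIFAR10(root, train=True, download=True, transform=tf)
    test_set = datasets.CIFAR10(root, train=False, download=True, transform=tf)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                              num_workers=num_workers)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False,
                             num_workers=num_workers)
    return train_loader, test_loader, 10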
Example #5
def experiment(logdir: Path, device: str):
    tb_logdir = logdir / "tensorboard"

    seed_all()
    model = SimpleNet().to(device)
    optimizer = optim.AdamW(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()

    train_loader, valid_loader = get_loaders("")

    with TensorboardLogger(tb_logdir) as tb:
        stage = "stage0"
        n_epochs = 10

        checkpointer = CheckpointManager(
            logdir=logdir / stage,
            metric="accuracy",
            metric_minimization=False,
            save_n_best=3,
        )

        for ep in range(1, n_epochs + 1):
            print(f"[Epoch {ep}/{n_epochs}]")
            train_loss, train_acc = train_fn(
                model, train_loader, device, criterion, optimizer
            )
            valid_loss, valid_acc = valid_fn(model, valid_loader, device, criterion)

            # log metrics
            tb.metric(f"{stage}/loss", {"train": train_loss, "valid": valid_loss}, ep)
            tb.metric(
                f"{stage}/accuracy", {"train": train_acc, "valid": valid_acc}, ep,
            )

            epoch_metrics = {
                "train_loss": train_loss,
                "train_accuracy": train_acc,
                "valid_loss": valid_loss,
                "valid_accuracy": valid_acc,
            }

            # store checkpoints
            checkpointer.process(
                score=valid_acc,
                epoch=ep,
                checkpoint=make_checkpoint(
                    stage, ep, model, optimizer, metrics=epoch_metrics,
                ),
            )

            print()
            print(f"            train loss - {train_loss:.5f}")
            print(f"train dataset accuracy - {train_acc:.5f}")
            print(f"            valid loss - {valid_loss:.5f}")
            print(f"valid dataset accuracy - {valid_acc:.5f}")
            print()

        # do a next training stage
        stage = "stage1"
        n_epochs = 10
        print(f"\n\nStage - {stage}")

        checkpointer = CheckpointManager(
            logdir=logdir / stage,
            metric="accuracy",
            metric_minimization=False,
            save_n_best=3,
        )

        load_checkpoint(logdir / "stage0" / "best.pth", model)
        optimizer = optim.Adam(model.parameters(), lr=1e-4 / 2)

        for ep in range(1, n_epochs + 1):
            print(f"[Epoch {ep}/{n_epochs}]")
            train_loss, train_acc = train_fn(
                model, train_loader, device, criterion, optimizer
            )
            valid_loss, valid_acc = valid_fn(model, valid_loader, device, criterion)

            # log metrics
            tb.metric(f"{stage}/loss", {"train": train_loss, "valid": valid_loss}, ep)
            tb.metric(
                f"{stage}/accuracy", {"train": train_acc, "valid": valid_acc}, ep,
            )

            epoch_metrics = {
                "train_loss": train_loss,
                "train_accuracy": train_acc,
                "valid_loss": valid_loss,
                "valid_accuracy": valid_acc,
            }

            # store checkpoints
            checkpointer.process(
                score=valid_acc,
                epoch=ep,
                checkpoint=make_checkpoint(
                    stage, ep, model, optimizer, metrics=epoch_metrics,
                ),
            )

            print()
            print(f"            train loss - {train_loss:.5f}")
            print(f"train dataset accuracy - {train_acc:.5f}")
            print(f"            valid loss - {valid_loss:.5f}")
            print(f"valid dataset accuracy - {valid_acc:.5f}")
            print()

        load_checkpoint(logdir / "stage1" / "best.pth", model)
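
Example #5 calls a seed_all() helper that is not shown. A common pattern for such a helper, sketched here as an assumption about what it does (fix every RNG the training code may touch):

import random

import numpy as np
import torch

def seed_all(seed: int = 42):
    # Assumed helper: seed Python, NumPy, and PyTorch (CPU and all GPUs).
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)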
Example #6
def experiment(rank, world_size, logdir):
    """Experiment flow.

    Args:
        rank (int): process rank
        world_size (int): world size
        logdir (pathlib.Path): directory with logs
    """
    # preparations
    torch.cuda.set_device(rank)
    setup(rank, world_size)
    logdir = Path(logdir) if isinstance(logdir, str) else logdir
    tb_logdir = logdir / "tensorboard"

    main_metric = "accuracy"
    minimize_metric = False

    def log(text):
        if rank == 0:
            print(text)

    train_loader, valid_loader = get_loaders("", rank, world_size)
    world_setup = (rank, world_size)

    train_batch_cnt = 0
    valid_batch_cnt = 0

    with TensorboardLogger(str(tb_logdir), write_to_disk=(rank == 0)) as tb:
        stage = "stage0"
        n_epochs = 2
        log(f"Stage - {stage}")

        seed_all()
        model = SimpleNet()
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        log("Used sync batchnorm")

        model = model.to(rank)
        model = nn.parallel.DistributedDataParallel(model, device_ids=[rank])
        optimizer = optim.AdamW(model.parameters(), lr=1e-3)
        criterion = nn.CrossEntropyLoss()

        checkpointer = CheckpointManager(
            logdir=logdir / stage,
            metric=main_metric,
            metric_minimization=minimize_metric,
            save_n_best=3,
        )

        for ep in range(1, n_epochs + 1):
            log(f"[Epoch {ep}/{n_epochs}]")
            train_metrics = train_fn(
                model,
                train_loader,
                world_setup,
                criterion,
                optimizer,
                tb_logger=tb,
                last_iteration_index=train_batch_cnt,
            )
            if rank == 0:
                tb.add_scalars(f"{stage}/train", train_metrics, ep)
            train_batch_cnt += len(train_loader)

            valid_metrics = valid_fn(
                model,
                valid_loader,
                world_setup,
                criterion,
                tb_logger=tb,
                last_iteration_index=valid_batch_cnt,
            )
            valid_batch_cnt += len(valid_loader)
            if rank == 0:
                tb.add_scalars(f"{stage}/valid", valid_metrics, ep)

                # store checkpoints
                checkpointer.process(
                    score=valid_metrics[main_metric],
                    epoch=ep,
                    checkpoint=make_checkpoint(
                        stage,
                        ep,
                        model,
                        optimizer,
                        metrics={
                            "train": train_metrics,
                            "valid": valid_metrics
                        },
                    ),
                )

            log("[{}/{}] train: loss - {}, accuracy - {}".format(
                ep, n_epochs, train_metrics["loss"],
                train_metrics["accuracy"]))
            log("[{}/{}] valid: loss - {}, accuracy - {}".format(
                ep, n_epochs, valid_metrics["loss"],
                valid_metrics["accuracy"]))

        # do a next training stage
        stage = "stage1"
        n_epochs = 3
        log("*" * 100)
        log(f"Stage - {stage}")

        # wait other processes
        dist.barrier()

        model = SimpleNet()
        load_checkpoint(logdir / "stage0" / "best.pth", model, verbose=True)
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

        model = model.to(rank)
        model = nn.parallel.DistributedDataParallel(model, device_ids=[rank])
        optimizer = optim.Adam(model.parameters(), lr=1e-4 / 2)

        checkpointer = CheckpointManager(
            logdir=logdir / stage,
            metric=main_metric,
            metric_minimization=minimize_metric,
            save_n_best=3,
        )

        for ep in range(1, n_epochs + 1):
            log(f"[Epoch {ep}/{n_epochs}]")
            train_metrics = train_fn(
                model,
                train_loader,
                world_setup,
                criterion,
                optimizer,
                tb_logger=tb,
                last_iteration_index=train_batch_cnt,
            )
            if rank == 0:
                tb.add_scalars(f"{stage}/train", train_metrics, ep)
            train_batch_cnt += len(train_loader)

            valid_metrics = valid_fn(
                model,
                valid_loader,
                world_setup,
                criterion,
                tb_logger=tb,
                last_iteration_index=valid_batch_cnt,
            )
            valid_batch_cnt += len(valid_loader)
            if rank == 0:
                tb.add_scalars(f"{stage}/valid", valid_metrics, ep)

                # store checkpoints
                checkpointer.process(
                    score=valid_metrics[main_metric],
                    epoch=ep,
                    checkpoint=make_checkpoint(
                        stage,
                        ep,
                        model,
                        optimizer,
                        metrics={
                            "train": train_metrics,
                            "valid": valid_metrics
                        },
                    ),
                )

            log("[{}/{}] train: loss - {}, accuracy - {}".format(
                ep, n_epochs, train_metrics["loss"],
                train_metrics["accuracy"]))
            log("[{}/{}] valid: loss - {}, accuracy - {}".format(
                ep, n_epochs, valid_metrics["loss"],
                valid_metrics["accuracy"]))

    cleanup()
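
Example #6 relies on setup(rank, world_size) and cleanup() helpers that are not shown. A minimal sketch of the torch.distributed boilerplate they typically wrap; the backend, address, and port are assumptions for a single-node run:

import os

import torch.distributed as dist

def setup(rank, world_size):
    # Assumed single-node defaults; real code usually reads these from the environment.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()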
Example #7
def main(eval_args):
    # ensures that weight initializations are all the same
    logging = utils.Logger(eval_args.local_rank, eval_args.save)

    # load a checkpoint
    logging.info('loading the model at:')
    logging.info(eval_args.checkpoint)
    checkpoint = torch.load(eval_args.checkpoint, map_location='cpu')
    args = checkpoint['args']

    if not hasattr(args, 'ada_groups'):
        logging.info('old model, no ada groups was found.')
        args.ada_groups = False

    if not hasattr(args, 'min_groups_per_scale'):
        logging.info('old model, no min_groups_per_scale was found.')
        args.min_groups_per_scale = 1

    if not hasattr(args, 'num_mixture_dec'):
        logging.info('old model, no num_mixture_dec was found.')
        args.num_mixture_dec = 10

    logging.info('loaded the model at epoch %d', checkpoint['epoch'])
    arch_instance = utils.get_arch_cells(args.arch_instance)
    model = AutoEncoder(args, None, arch_instance)
    # Loading is not strict because of self.weight_normalized in Conv2D class in neural_operations. This variable
    # is only used for computing the spectral normalization and it is safe not to load it. Some of our earlier models
    # did not have this variable.
    model.load_state_dict(checkpoint['state_dict'], strict=False)
    model = model.cuda()

    logging.info('args = %s', args)
    logging.info('num conv layers: %d', len(model.all_conv_layers))
    logging.info('param size = %fM ', utils.count_parameters_in_M(model))

    if eval_args.eval_mode == 'evaluate':
        # load train valid queue
        args.data = eval_args.data
        train_queue, valid_queue, num_classes = datasets.get_loaders(args)

        if eval_args.eval_on_train:
            logging.info('Using the training data for eval.')
            valid_queue = train_queue

        # get number of bits
        num_output = utils.num_output(args.dataset)
        bpd_coeff = 1. / np.log(2.) / num_output

        valid_neg_log_p, valid_nelbo = test(
            valid_queue,
            model,
            num_samples=eval_args.num_iw_samples,
            args=args,
            logging=logging)
        logging.info('final valid nelbo %f', valid_nelbo)
        logging.info('final valid neg log p %f', valid_neg_log_p)
        logging.info('final valid nelbo in bpd %f', valid_nelbo * bpd_coeff)
        logging.info('final valid neg log p in bpd %f',
                     valid_neg_log_p * bpd_coeff)

    else:
        bn_eval_mode = not eval_args.readjust_bn
        num_samples = 16
        with torch.no_grad():
            n = int(np.floor(np.sqrt(num_samples)))
            set_bn(model,
                   bn_eval_mode,
                   num_samples=36,
                   t=eval_args.temp,
                   iter=500)
            for ind in range(10):  # sampling is repeated.
                torch.cuda.synchronize()
                start = time()
                with autocast():
                    logits = model.sample(num_samples, eval_args.temp)
                output = model.decoder_output(logits)
                output_img = output.mean if isinstance(output, torch.distributions.bernoulli.Bernoulli) \
                    else output.sample()
                torch.cuda.synchronize()
                end = time()

                # save images to 'results/eval-x/images/epochn' where x is the exp id and n is the epoch number
                # print("tensor shape: {}".format(output_img.shape))
                # try saving the images one by one
                path_to_images = '/content/gdrive/MyDrive/pipeline_results/NVAE/results/eval-1/images'
                if not os.path.exists(path_to_images):
                    os.makedirs(path_to_images)
                for i in range(output_img.size(0)):
                    vutils.save_image(output_img[i, :, :, :],
                                      '%s/sample_batch%03d_img%03d.png' %
                                      (path_to_images, ind + 1, i + 1),
                                      normalize=True)
Example #8
def main(config):
    model = load_model(config)
    train_loader, val_loader = get_loaders(model, config)

    # Make dirs
    if not os.path.exists(config.checkpoints):
        os.makedirs(config.checkpoints, exist_ok=True)
    if not os.path.exists(config.save_path):
        os.makedirs(config.save_path, exist_ok=True)

    # Loss Functions
    criterion_GAN = mse_loss

    # Calculate output of image discriminator (PatchGAN)
    patch = (1, config.image_size // 2**4, config.image_size // 2**4)

    # Initialize
    vgg = Vgg16().to(config.device)
    resnet = ResNet18(requires_grad=True, pretrained=True).to(config.device)
    generator = GeneratorUNet().to(config.device)
    discriminator = Discriminator().to(config.device)

    if config.epoch != 0:
        # Load pretrained models
        resnet.load_state_dict(
            torch.load(
                os.path.join(config.checkpoints, 'epoch_%d_%s.pth' %
                             (config.epoch - 1, 'resnet'))))
        generator.load_state_dict(
            torch.load(
                os.path.join(
                    config.checkpoints,
                    'epoch_%d_%s.pth' % (config.epoch - 1, 'generator'))))
        discriminator.load_state_dict(
            torch.load(
                os.path.join(
                    config.checkpoints,
                    'epoch_%d_%s.pth' % (config.epoch - 1, 'discriminator'))))
    else:
        # Initialize weights
        # resnet.apply(weights_init_normal)
        generator.apply(weights_init_normal)
        discriminator.apply(weights_init_normal)

    # Optimizers
    optimizer_resnet = torch.optim.Adam(resnet.parameters(),
                                        lr=config.lr,
                                        betas=(config.b1, config.b2))
    optimizer_G = torch.optim.Adam(generator.parameters(),
                                   lr=config.lr,
                                   betas=(config.b1, config.b2))
    optimizer_D = torch.optim.Adam(discriminator.parameters(),
                                   lr=config.lr,
                                   betas=(config.b1, config.b2))

    # ----------
    #  Training
    # ----------

    resnet.train()
    generator.train()
    discriminator.train()
    for epoch in range(config.epoch, config.n_epochs):
        for i, (im1, m1, im2, m2) in enumerate(train_loader):
            assert im1.size(0) == im2.size(0)
            valid = Variable(torch.Tensor(np.ones(
                (im1.size(0), *patch))).to(config.device),
                             requires_grad=False)
            # fake targets are zeros for the MSE (LSGAN) discriminator loss
            fake = Variable(torch.Tensor(np.zeros(
                (im1.size(0), *patch))).to(config.device),
                            requires_grad=False)

            # ------------------
            #  Train Generators
            # ------------------

            optimizer_resnet.zero_grad()
            optimizer_G.zero_grad()

            # GAN loss
            z = resnet(im2 * m2)
            if epoch < config.gan_epochs:
                fake_im = generator(im1 * (1 - m1), im2 * m2, z)
            else:
                fake_im = generator(im1, im2, z)
            if epoch < config.gan_epochs:
                pred_fake = discriminator(fake_im, im2)
                gan_loss = config.lambda_gan * criterion_GAN(pred_fake, valid)
            else:
                gan_loss = torch.Tensor([0]).to(config.device)

            # Hair, Face loss
            fake_m2 = torch.argmax(model(fake_im),
                                   1).unsqueeze(1).type(torch.uint8).repeat(
                                       1, 3, 1, 1).to(config.device)
            if 0.5 * torch.sum(m1) <= torch.sum(
                    fake_m2) <= 1.5 * torch.sum(m1):
                hair_loss = config.lambda_style * calc_style_loss(
                    fake_im * fake_m2, im2 * m2, vgg) + calc_content_loss(
                        fake_im * fake_m2, im2 * m2, vgg)
                face_loss = calc_content_loss(fake_im, im1, vgg)
            else:
                hair_loss = config.lambda_style * calc_style_loss(
                    fake_im * m1, im2 * m2, vgg) + calc_content_loss(
                        fake_im * m1, im2 * m2, vgg)
                face_loss = calc_content_loss(fake_im, im1, vgg)
            hair_loss *= config.lambda_hair
            face_loss *= config.lambda_face

            # Total loss
            loss = gan_loss + hair_loss + face_loss

            loss.backward()
            optimizer_resnet.step()
            optimizer_G.step()

            # ---------------------
            #  Train Discriminator
            # ---------------------

            if epoch < config.gan_epochs:
                optimizer_D.zero_grad()

                # Real loss
                pred_real = discriminator(im1 * (1 - m1) + im2 * m2, im2)
                loss_real = criterion_GAN(pred_real, valid)
                # Fake loss
                pred_fake = discriminator(fake_im.detach(), im2)
                loss_fake = criterion_GAN(pred_fake, fake)
                # Total loss
                loss_D = 0.5 * (loss_real + loss_fake)

                loss_D.backward()
                optimizer_D.step()

            if i % config.sample_interval == 0:
                msg = "Train || Gan loss: %.6f, hair loss: %.6f, face loss: %.6f, loss: %.6f\n" % \
                    (gan_loss.item(), hair_loss.item(), face_loss.item(), loss.item())
                sys.stdout.write("Epoch: %d || Batch: %d\n" % (epoch, i))
                sys.stdout.write(msg)
                fname = os.path.join(
                    config.save_path,
                    "Train_Epoch:%d_Batch:%d.png" % (epoch, i))
                sample_images([im1[0], im2[0], fake_im[0]],
                              ["img1", "img2", "img1+img2"], fname)
                for j, (im1, m1, im2, m2) in enumerate(val_loader):
                    with torch.no_grad():
                        valid = Variable(torch.Tensor(
                            np.ones((im1.size(0), *patch))).to(config.device),
                                         requires_grad=False)
                        fake = Variable(torch.Tensor(
                            np.zeros((im1.size(0), *patch))).to(config.device),
                                        requires_grad=False)

                        # GAN loss
                        z = resnet(im2 * m2)
                        if epoch < config.gan_epochs:
                            fake_im = generator(im1 * (1 - m1), im2 * m2, z)
                        else:
                            fake_im = generator(im1, im2, z)

                        if epoch < config.gan_epochs:
                            pred_fake = discriminator(fake_im, im2)
                            gan_loss = config.lambda_gan * criterion_GAN(
                                pred_fake, valid)
                        else:
                            gan_loss = torch.Tensor([0]).to(config.device)

                        # Hair, Face loss
                        fake_m2 = torch.argmax(
                            model(fake_im),
                            1).unsqueeze(1).type(torch.uint8).repeat(
                                1, 3, 1, 1).to(config.device)
                        if 0.5 * torch.sum(m1) <= torch.sum(
                                fake_m2) <= 1.5 * torch.sum(m1):
                            hair_loss = config.lambda_style * calc_style_loss(
                                fake_im * fake_m2, im2 * m2,
                                vgg) + calc_content_loss(
                                    fake_im * fake_m2, im2 * m2, vgg)
                            face_loss = calc_content_loss(fake_im, im1, vgg)
                        else:
                            hair_loss = config.lambda_style * calc_style_loss(
                                fake_im * m1, im2 * m2,
                                vgg) + calc_content_loss(
                                    fake_im * m1, im2 * m2, vgg)
                            face_loss = calc_content_loss(fake_im, im1, vgg)
                        hair_loss *= config.lambda_hair
                        face_loss *= config.lambda_face

                        # Total loss
                        loss = gan_loss + hair_loss + face_loss

                        msg = "Validation || Gan loss: %.6f, hair loss: %.6f, face loss: %.6f, loss: %.6f\n" % \
                                (gan_loss.item(), hair_loss.item(), face_loss.item(), loss.item())
                        sys.stdout.write(msg)
                        fname = os.path.join(
                            config.save_path,
                            "Validation_Epoch:%d_Batch:%d.png" % (epoch, i))
                        sample_images([im1[0], im2[0], fake_im[0]],
                                      ["img1", "img2", "img1+img2"], fname)
                        break

        if epoch % config.checkpoint_interval == 0:
            if epoch < config.gan_epochs:
                models = [resnet, generator, discriminator]
                fnames = ['resnet', 'generator', 'discriminator']
            else:
                models = [resnet, generator]
                fnames = ['resnet', 'generator']
            fnames = [
                os.path.join(config.checkpoints,
                             'epoch_%d_%s.pth' % (epoch, s)) for s in fnames
            ]
            save_weights(models, fnames)
Example #9
    } if not args.use_cpu else {}

    ckpt_dir = os.path.join(args.ckpt_dir, args.run_name)
    os.makedirs(ckpt_dir, exist_ok=True)
    log_dir = os.path.join(args.log_dir, args.run_name)
    os.makedirs(log_dir, exist_ok=True)
    logger = SummaryWriter(log_dir)

    jitter_size = args.resize + 30  # random jitter from pix2pix
    tf = transforms.Compose([
        transforms.Resize(jitter_size, Image.ANTIALIAS),
        transforms.RandomCrop((args.resize, args.resize)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
    ])
    train_loader, test_loader = get_loaders(args, tf)

    G = Generator(in_channels=3, out_channels=3,
                  n_blocks=9).to(device)  # A to B
    F = Generator(in_channels=3, out_channels=3,
                  n_blocks=9).to(device)  # B to A
    D_A = Discriminator(in_channels=3).to(device)
    D_B = Discriminator(in_channels=3).to(device)
    nets = [G, F, D_A, D_B]
    for net in nets:
        net.apply(init_weights_gaussian)

    G_opt = optim.Adam(G.parameters(), lr=args.lr)
    F_opt = optim.Adam(F.parameters(), lr=args.lr)
    D_A_opt = optim.Adam(D_A.parameters(), lr=args.lr)
    D_B_opt = optim.Adam(D_B.parameters(), lr=args.lr)
Example #10
                    metavar='str',
                    help='dir to save checkpoints (default: ./checkpoints)')
parser.add_argument(
    '--vis_dir',
    type=str,
    default=r'./val_out',
    metavar='str',
    help='dir to save results during training (default: ./val_out)')
parser.add_argument('--lr',
                    type=float,
                    default=2e-4,
                    help='learning rate (default: 0.0002)')
parser.add_argument('--max_num_epochs',
                    type=int,
                    default=100,
                    metavar='N',
                    help='max number of training epochs (default 200)')
parser.add_argument(
    '--scheduler_step_size',
    type=int,
    default=50,
    metavar='N',
    help='after m epochs then reduce lr to 0.1*lr (default 500)')
args = parser.parse_args()

if __name__ == '__main__':

    dataloaders = datasets.get_loaders(args)
    nn_classifier = Classifier(args=args, dataloaders=dataloaders)
    nn_classifier.train_models()
Example #11
    logger['args'] = args
    logger['checkpoint'] = os.path.join('models/', logger.index + '.pth')
    logger['checkpoint_step'] = os.path.join('models/',
                                             logger.index + '_{}.pth')

    print("[Logging in {}]".format(logger.index))

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    args.device = torch.device("cuda" if use_cuda else "cpu")

    os.makedirs('checkpoints', exist_ok=True)

    train_loader, valid_loader, test_loader = datasets.get_loaders(
        args.dataset,
        class_to_replace=args.forget_class,
        num_indexes_to_replace=args.num_to_forget,
        batch_size=args.batch_size,
        seed=args.seed,
        augment=args.augment)

    num_classes = (max(train_loader.dataset.targets) + 1
                   if args.num_classes is None else args.num_classes)
    args.num_classes = num_classes
    print(f"Number of Classes: {num_classes}")
    model = models.get_model(args.model,
                             num_classes=num_classes,
                             filters_percentage=args.filters).to(args.device)

    if args.model == 'allcnn': classifier_name = 'classifier.'
    elif 'resnet' in args.model: classifier_name = 'linear.'
Example #12
def main(eval_args):
    # ensures that weight initializations are all the same
    logging = utils.Logger(eval_args.local_rank, eval_args.save)

    # load a checkpoint
    logging.info('loading the model at:')
    logging.info(eval_args.checkpoint)
    checkpoint = torch.load(eval_args.checkpoint, map_location='cpu')
    args = checkpoint['args']

    logging.info('loaded the model at epoch %d', checkpoint['epoch'])
    arch_instance = utils.get_arch_cells(args.arch_instance)
    model = AutoEncoder(args, None, arch_instance)
    model.load_state_dict(checkpoint['state_dict'])
    model = model.cuda()

    logging.info('args = %s', args)
    logging.info('num conv layers: %d', len(model.all_conv_layers))
    logging.info('param size = %fM ', utils.count_parameters_in_M(model))

    if eval_args.eval_mode == 'evaluate':
        # load train valid queue
        args.data = eval_args.data
        train_queue, valid_queue, num_classes, test_queue = datasets.get_loaders(args)

        if eval_args.eval_on_train:
            logging.info('Using the training data for eval.')
            valid_queue = train_queue
        if eval_args.eval_on_test:
            logging.info('Using the test data for eval.')
            valid_queue = test_queue

        # get number of bits
        num_output = utils.num_output(args.dataset, args)
        bpd_coeff = 1. / np.log(2.) / num_output

        valid_neg_log_p, valid_nelbo = test(valid_queue, model, num_samples=eval_args.num_iw_samples, args=args, logging=logging)
        logging.info('final valid nelbo %f', valid_nelbo)
        logging.info('final valid neg log p %f', valid_neg_log_p)
        logging.info('final valid nelbo in bpd %f', valid_nelbo * bpd_coeff)
        logging.info('final valid neg log p in bpd %f', valid_neg_log_p * bpd_coeff)

    else:
        bn_eval_mode = not eval_args.readjust_bn
        num_samples = 16
        with torch.no_grad():
            n = int(np.floor(np.sqrt(num_samples)))
            set_bn(model, bn_eval_mode, num_samples=36, t=eval_args.temp, iter=500)
            for ind in range(eval_args.repetition):     # sampling is repeated.
                torch.cuda.synchronize()
                start = time()
                with autocast():
                    logits = model.sample(num_samples, eval_args.temp)
                output = model.decoder_output(logits)
                output_img = output.mean if isinstance(output, torch.distributions.bernoulli.Bernoulli) \
                    else output.sample()
                torch.cuda.synchronize()
                end = time()

                # save to file
                total_name = "{}/data_to_save_{}_{}.pickle".format(eval_args.save, eval_args.name_to_save, ind)
                with open(total_name, 'wb') as handle:
                    pickle.dump(output_img.detach().cpu().numpy(), handle, protocol=pickle.HIGHEST_PROTOCOL)

                output_tiled = utils.tile_image(output_img, n).cpu().numpy().transpose(1, 2, 0)
                logging.info('sampling time per batch: %0.3f sec', (end - start))
                output_tiled = np.asarray(output_tiled * 255, dtype=np.uint8)
                output_tiled = np.squeeze(output_tiled)

                plt.imshow(output_tiled)
                plt.savefig("{}/generation_{}_{}".format(eval_args.save, eval_args.name_to_save, ind))