Example #1
        x1 = schedule.forward(input_,
                              *list(model.state_dict(keep_vars=True).values()))

        # GoogLeNet-style models return several outputs, so compare only the primary one.
        diff = abs(x0[0] - x1) if 'googlenet' in args.model else abs(x0 - x1)
        print('Forward mean absolute difference', diff.mean())

        schedule.backward(-torch.ones_like(x1))

        print('Gradient of normal model')
        gradient_diff = [
            "{:.5f} {} {}".format(float(v.grad.mean()), n, v.shape)
            for n, v in model.named_parameters() if v.grad is not None
        ]
        for gd in gradient_diff:
            print(gd)
        exit()

    if args.run_bs:
        graph = Graph.create(model, input_shape=(3, height, width))
        model.cuda()
        solvert = -1
        bs = int(args.bs)
        print("Solver trying batch size %d" % bs)
        if len(args.solution_file) > 0:
            solver_info, solution = load_solution(args.solution_file)
        else:
            input_ = torch.randn((bs, 3, height, width)).cuda()
def train_eval_model(opts):
    # parse model configuration
    num_epochs = opts["num_epochs"]
    train_batch_size = opts["train_batch_size"]
    val_batch_size = opts["eval_batch_size"]
    dataset_type = opts["dataset_type"]

    opti_mode = opts["optimizer"]
    loss_criterion = opts["loss_criterion"]
    lr = opts["lr"]
    lr_decay = opts["lr_decay"]
    wd = opts["weight_decay"]

    gpus = opts["gpu_list"].split(',')
    os.environ['CUDA_VISIBLE_DEVICES'] = opts["gpu_list"]
    train_dir = opts["log_dir"]

    train_data_dir = opts["train_data_dir"]
    eval_data_dir = opts["eval_data_dir"]

    pretrained = opts["pretrained_model"]
    resume = opts["resume"]
    display_iter = opts["display_iter"]
    save_epoch = opts["save_every_epoch"]
    show = opts["vis"]

    # backup train configs
    log_file = os.path.join(train_dir, "log_file.txt")
    os.makedirs(train_dir, exist_ok=True)
    model_dir = os.path.join(train_dir, "code_backup")
    os.makedirs(model_dir, exist_ok=True)
    if resume is None and os.path.exists(log_file):
        os.remove(log_file)
    shutil.copy("./models/unet.py", os.path.join(model_dir, "unet.py"))
    shutil.copy("./trainer_unet.py", os.path.join(model_dir,
                                                  "trainer_unet.py"))
    shutil.copy("./datasets/dataset.py", os.path.join(model_dir, "dataset.py"))

    ckt_dir = os.path.join(train_dir, "checkpoints")
    os.makedirs(ckt_dir, exist_ok=True)

    # format printing configs
    print("*" * 50)
    table_key = list(opts.keys())
    table_value = [str(value) for value in opts.values()]
    print_table([table_key, ["="] * len(table_key), table_value])

    # format gpu list
    gpu_list = [int(str_id) for str_id in gpus]

    # dataloader
    print("==> Create dataloader")
    dataloaders_dict = {
        "train": er_data_loader(train_data_dir, train_batch_size, dataset_type,
                                is_train=True),
        "eval": er_data_loader(eval_data_dir, val_batch_size, dataset_type,
                               is_train=False)
    }

    # define parameters of two networks
    print("==> Create network")
    num_channels = 1
    num_classes = 1
    model = UNet(num_channels, num_classes)
    init_weights(model)

    # loss layer
    criterion = create_criterion(criterion=loss_criterion)

    best_acc = 0.0
    start_epoch = 0

    # load pretrained model
    if pretrained is not None and os.path.isfile(pretrained):
        print("==> Train from model '{}'".format(pretrained))
        checkpoint_gan = torch.load(pretrained)
        model.load_state_dict(checkpoint_gan['model_state_dict'])
        print("==> Loaded checkpoint '{}')".format(pretrained))
        for param in model.parameters():
            param.requires_grad = False

    # resume training
    elif resume is not None and os.path.isfile(resume):
        print("==> Resume from checkpoint '{}'".format(resume))
        checkpoint = torch.load(resume)
        start_epoch = checkpoint['epoch'] + 1
        best_acc = checkpoint['best_acc']
        # keep only checkpoint entries whose name and shape match the current model
        model_dict = model.state_dict()
        pretrained_dict = {
            k: v
            for k, v in checkpoint['model_state_dict'].items()
            if k in model_dict and v.size() == model_dict[k].size()
        }
        model_dict.update(pretrained_dict)
        model.load_state_dict(model_dict)
        print("==> Loaded checkpoint '{}' (epoch {})".format(
            resume, checkpoint['epoch'] + 1))

    # train from scratch
    else:
        print("==> Train from initial or random state.")

    # define multi-GPU mode
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model = nn.DataParallel(model)

    # print learnable parameters
    print("==> List learnable parameters")
    for name, param in model.named_parameters():
        if param.requires_grad:
            print("\t{}, size {}".format(name, param.size()))
    params_to_update = [{'params': model.parameters()}]

    # define optimizer
    print("==> Create optimizer")
    optimizer = create_optimizer(params_to_update,
                                 opti_mode,
                                 lr=lr,
                                 momentum=0.9,
                                 wd=wd)
    if resume is not None and os.path.isfile(resume):
        optimizer.load_state_dict(checkpoint['optimizer'])

    # start training
    since = time.time()

    # Each epoch has a training and validation phase
    print("==> Start training")
    total_steps = 0

    for epoch in range(start_epoch, num_epochs):

        print('-' * 50)
        print("==> Epoch {}/{}".format(epoch + 1, num_epochs))

        total_steps = train_one_epoch(epoch, total_steps,
                                      dataloaders_dict['train'], model, device,
                                      criterion, optimizer, lr, lr_decay,
                                      display_iter, log_file, show)

        epoch_acc, epoch_iou, epoch_f1 = eval_one_epoch(
            epoch, dataloaders_dict['eval'], model, device, log_file)

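        # keep the checkpoint with the best validation accuracy, skipping the
        # first few warm-up epochs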
        if best_acc < epoch_acc and epoch >= 5:
            best_acc = epoch_acc
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.module.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'best_acc': best_acc
                }, os.path.join(ckt_dir, "best.pth"))

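        # additionally save a numbered snapshot every `save_epoch` epochs once
        # training has passed epoch 20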
        if (epoch + 1) % save_epoch == 0 and (epoch + 1) >= 20:
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.module.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'best_iou': epoch_iou
                },
                os.path.join(ckt_dir,
                             "checkpoints_" + str(epoch + 1) + ".pth"))

    time_elapsed = time.time() - since
    time_message = 'Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60)
    print(time_message)
    with open(log_file, "a+") as fid:
        fid.write('%s\n' % time_message)
    print('==> Best val Acc: {:.4f}'.format(best_acc))
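

# Hypothetical usage sketch (not part of the original script): the dictionary
# keys below mirror exactly what train_eval_model() reads from `opts`; the
# values and paths are illustrative placeholders only.
if __name__ == "__main__":
    example_opts = {
        "num_epochs": 60,
        "train_batch_size": 8,
        "eval_batch_size": 1,
        "dataset_type": "er",            # assumed dataset identifier
        "optimizer": "adam",             # assumed mode accepted by create_optimizer
        "loss_criterion": "bce",         # assumed mode accepted by create_criterion
        "lr": 1e-3,
        "lr_decay": 0.1,
        "weight_decay": 5e-4,
        "gpu_list": "0",
        "log_dir": "./logs/unet_example",
        "train_data_dir": "./data/train",
        "eval_data_dir": "./data/eval",
        "pretrained_model": None,
        "resume": None,
        "display_iter": 10,
        "save_every_epoch": 10,
        "vis": False,
    }
    train_eval_model(example_opts)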