Beispiel #1
0
def train(config, num_workers, num_threads, cuda, restart_train, mGPU):
    """Train a ``KPN`` model as described by ``config``.

    The function builds the dataset, model, loss, optimizer and LR scheduler
    from the ``'training'`` and ``'architecture'`` sections of ``config``,
    optionally resumes from the ``'best'`` checkpoint, and then runs the
    training loop. Scalars are written to a ``SummaryWriter`` every step and
    a checkpoint is saved every ``training.save_freq`` global steps.

    Args:
        config: Mapping providing the ``'training'`` and ``'architecture'``
            sections read below.
        num_workers: Number of worker processes handed to the ``DataLoader``.
        num_threads: Currently unused (the ``torch.set_num_threads`` call is
            disabled).
        cuda: If true, the model and the input/ground-truth batches are moved
            to the GPU.
        restart_train: If true, start from epoch 0 without loading a
            checkpoint; otherwise try to resume and fall back to a fresh
            start when loading fails.
        mGPU: If true, wrap the model in ``nn.DataParallel``.

    Raises:
        ValueError: If ``training.optimizer`` is neither ``'adam'`` nor
            ``'sgd'``.
    """
    # torch.set_num_threads(num_threads)

    train_config = config['training']
    arch_config = config['architecture']

    batch_size = train_config['batch_size']
    lr = train_config['learning_rate']
    weight_decay = train_config['weight_decay']
    lr_decay = train_config['lr_decay']
    n_epoch = train_config['num_epochs']
    save_freq = train_config['save_freq']

    print('Configs:', config)
    # checkpoint path
    checkpoint_dir = train_config['checkpoint_dir']
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    # logs path: the directory is (re)created and then wiped so that every
    # run starts with empty logs. NOTE(review): this also discards the logs
    # of the run being resumed when restart_train is false.
    logs_dir = train_config['logs_dir']
    if not os.path.exists(logs_dir):
        os.makedirs(logs_dir)
    shutil.rmtree(logs_dir)
    log_writer = SummaryWriter(logs_dir)

    # dataset and dataloader
    data_set = TrainDataSet(train_config['dataset_configs'],
                            img_format='.bmp',
                            degamma=True,
                            color=False,
                            blind=arch_config['blind_est'])
    data_loader = DataLoader(data_set,
                             batch_size=batch_size,
                             shuffle=True,
                             num_workers=num_workers)
    dataset_config = read_config(train_config['dataset_configs'],
                                 _configspec_path())['dataset_configs']
    burst_length = dataset_config['burst_length']

    # model here
    model = KPN(color=False,
                burst_length=burst_length,
                blind_est=arch_config['blind_est'],
                kernel_size=list(map(int, arch_config['kernel_size'].split())),
                sep_conv=arch_config['sep_conv'],
                channel_att=arch_config['channel_att'],
                spatial_att=arch_config['spatial_att'],
                upMode=arch_config['upMode'],
                core_bias=arch_config['core_bias'])
    if cuda:
        model = model.cuda()

    if mGPU:
        model = nn.DataParallel(model)
    model.train()

    # loss function here
    loss_func = LossFunc(coeff_basic=1.0,
                         coeff_anneal=1.0,
                         gradient_L1=True,
                         alpha=arch_config['alpha'],
                         beta=arch_config['beta'])

    # Optimizer here
    optimizer_name = train_config['optimizer']
    if optimizer_name == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=lr)
    elif optimizer_name == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=lr,
                              momentum=0.9,
                              weight_decay=weight_decay)
    else:
        raise ValueError(
            "Optimizer must be 'sgd' or 'adam', but received {}.".format(
                optimizer_name))
    optimizer.zero_grad()

    # learning rate scheduler here
    # NOTE(review): step_size is hard-coded to 10; the 'decay_steps' entry of
    # the training config is not used here -- confirm this is intended.
    scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=lr_decay)

    average_loss = MovingAverage(save_freq)

    # Defaults for a fresh run; overwritten below when a checkpoint loads.
    start_epoch = 0
    global_step = 0
    best_loss = np.inf
    if restart_train:
        print('=> training')
    else:
        try:
            checkpoint = load_checkpoint(checkpoint_dir, 'best')
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['lr_scheduler'])
            start_epoch = checkpoint['epoch']
            global_step = checkpoint['global_iter']
            best_loss = checkpoint['best_loss']
            print('=> loaded checkpoint (epoch {}, global_step {})'.format(
                start_epoch, global_step))
        except Exception as err:
            # ``Exception`` (not a bare except) so that Ctrl-C and SystemExit
            # still abort the run instead of silently restarting training.
            start_epoch = 0
            global_step = 0
            best_loss = np.inf
            print('=> no checkpoint file to be loaded.')
            print('   reason: {!r}'.format(err))

    for epoch in range(start_epoch, n_epoch):
        epoch_start_time = time.time()
        # decay the learning rate, but never let it drop below 5e-6
        # NOTE(review): the scheduler is stepped before the epoch's optimizer
        # steps; recent PyTorch versions expect the opposite order.
        lr_cur = [param['lr'] for param in optimizer.param_groups]
        if lr_cur[0] > 5e-6:
            scheduler.step()
        else:
            for param in optimizer.param_groups:
                param['lr'] = 5e-6
        print(
            '=' * 20,
            'lr={}'.format([param['lr'] for param in optimizer.param_groups]),
            '=' * 20)
        t1 = time.time()
        for step, (burst_noise, gt, white_level) in enumerate(data_loader):
            if cuda:
                burst_noise = burst_noise.cuda()
                gt = gt.cuda()

            # forward: the second argument is the first ``burst_length``
            # entries along dim 1 of the input
            pred_i, pred = model(burst_noise,
                                 burst_noise[:, 0:burst_length, ...],
                                 white_level)

            # the loss is evaluated after applying sRGBGamma to all operands
            loss_basic, loss_anneal = loss_func(sRGBGamma(pred_i),
                                                sRGBGamma(pred), sRGBGamma(gt),
                                                global_step)
            loss = loss_basic + loss_anneal
            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # update the average loss with a plain float so the moving
            # average does not keep graph-attached tensors alive
            average_loss.update(loss.item())
            # calculate PSNR / SSIM
            psnr = calculate_psnr(pred.unsqueeze(1), gt.unsqueeze(1))
            ssim = calculate_ssim(pred.unsqueeze(1), gt.unsqueeze(1))

            # add scalars to tensorboardX
            log_writer.add_scalar('loss_basic', loss_basic, global_step)
            log_writer.add_scalar('loss_anneal', loss_anneal, global_step)
            log_writer.add_scalar('loss_total', loss, global_step)
            log_writer.add_scalar('psnr', psnr, global_step)
            log_writer.add_scalar('ssim', ssim, global_step)

            # print
            print(
                '{:-4d}\t| epoch {:2d}\t| step {:4d}\t| loss_basic: {:.4f}\t| loss_anneal: {:.4f}\t|'
                ' loss: {:.4f}\t| PSNR: {:.2f}dB\t| SSIM: {:.4f}\t| time:{:.2f} seconds.'
                .format(global_step, epoch, step, loss_basic, loss_anneal,
                        loss, psnr, ssim,
                        time.time() - t1))
            t1 = time.time()
            # global_step
            global_step += 1

            if global_step % save_freq == 0:
                current_average = average_loss.get_value()
                is_best = current_average < best_loss
                if is_best:
                    best_loss = current_average

                save_dict = {
                    'epoch': epoch,
                    'global_iter': global_step,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': scheduler.state_dict()
                }
                save_checkpoint(save_dict,
                                is_best,
                                checkpoint_dir,
                                global_step,
                                max_keep=train_config['ckpt_to_keep'])

        print('Epoch {} is finished, time elapsed {:.2f} seconds.'.format(
            epoch,
            time.time() - epoch_start_time))

    # Flush pending events and release the writer's file handle.
    log_writer.close()
Beispiel #2
0
def train(args):
    """Train ``MWRN_lv3`` on three-times wavelet-transformed image pairs.

    Every batch of ``(noise, gt)`` is passed through ``common.DWT`` three
    times; the model receives the transformed noisy input and its second
    output is compared against the transformed ground truth with
    ``BasicLoss``. Training resumes from the ``'latest'`` checkpoint in
    ``args.checkpoint`` when one can be loaded.

    Args:
        args: Namespace providing ``noise_dir``, ``gt_dir``, ``image_size``,
            ``batch_size``, ``num_workers``, ``checkpoint``, ``save_every``,
            ``loss_every`` and ``epoch``.
    """
    # torch.set_num_threads(4)
    # torch.manual_seed(args.seed)
    # checkpoint = utility.checkpoint(args)
    data_set = SingleLoader(noise_dir=args.noise_dir,
                            gt_dir=args.gt_dir,
                            image_size=args.image_size)
    data_loader = DataLoader(data_set,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    loss_basic = BasicLoss()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    checkpoint_dir = args.checkpoint
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    model = MWRN_lv3().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               [5, 10, 15, 20, 25, 30], 0.5)
    optimizer.zero_grad()
    average_loss = MovingAverage(args.save_every)

    try:
        # ``device == 'cuda'`` is always False because a torch.device never
        # compares equal to a str; the device *type* must be compared.
        checkpoint = load_checkpoint(checkpoint_dir, device.type == 'cuda',
                                     'latest')
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # Checkpoints written before the scheduler state was stored lack
        # this key; for those the schedule restarts as it did before.
        if 'lr_scheduler' in checkpoint:
            scheduler.load_state_dict(checkpoint['lr_scheduler'])
        start_epoch = checkpoint['epoch']
        global_step = checkpoint['global_iter']
        best_loss = checkpoint['best_loss']
        print('=> loaded checkpoint (epoch {}, global_step {})'.format(
            start_epoch, global_step))
    except Exception as err:
        # ``Exception`` rather than a bare except, so Ctrl-C / SystemExit
        # are not mistaken for a missing checkpoint.
        start_epoch = 0
        global_step = 0
        best_loss = np.inf
        print('=> no checkpoint file to be loaded.')
        print('   reason: {!r}'.format(err))

    DWT = common.DWT()
    param = [x for name, x in model.named_parameters()]
    clip_grad_D = 1e4
    grad_norm_D = 0
    for epoch in range(start_epoch, args.epoch):
        for step, (noise, gt) in enumerate(data_loader):
            noise = noise.to(device)
            gt = gt.to(device)
            # three successive DWT applications of the ground truth ...
            x1 = DWT(gt).to(device)
            x2 = DWT(x1).to(device)
            x3 = DWT(x2).to(device)
            # ... and of the noisy input
            y1 = DWT(noise).to(device)
            y2 = DWT(y1).to(device)
            y3 = DWT(y2).to(device)

            # only the second model output enters the loss
            _, img_lv3 = model(y3, None)
            loss = loss_basic(x3, img_lv3)

            optimizer.zero_grad()
            loss.backward()
            total_norm_D = nn.utils.clip_grad_norm_(param, clip_grad_D)
            # running mean of the gradient norm over the current epoch
            # (the weight of the old value is 0 at step 0)
            grad_norm_D = (grad_norm_D * (step / (step + 1)) + total_norm_D /
                           (step + 1))
            optimizer.step()
            # store a float so the moving average holds no graph references
            average_loss.update(loss.item())

            if global_step % args.save_every == 0:
                print("Save : epoch ", epoch,
                      " step : ", global_step, " with avg loss : ",
                      average_loss.get_value(), ",   best loss : ", best_loss)
                current_average = average_loss.get_value()
                is_best = current_average < best_loss
                if is_best:
                    best_loss = current_average
                save_dict = {
                    'epoch': epoch,
                    'global_iter': global_step,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': scheduler.state_dict(),
                }
                save_checkpoint(save_dict, is_best, checkpoint_dir,
                                global_step)
            if global_step % args.loss_every == 0:
                print(global_step, ": ", average_loss.get_value())
            global_step += 1
        # tighten the clipping threshold to the epoch's mean gradient norm
        clip_grad_D = min(clip_grad_D, grad_norm_D)
        scheduler.step()
        print("Epoch : ", epoch, "end at step: ", global_step)
Beispiel #3
0
def train():
    """Train ``Network`` with an L1 loss on ``TrainDataSet`` batches.

    The command line accepts ``--restart``/``-r``; without it the weights of
    the ``'best'`` checkpoint in ``./noise_models`` are loaded before
    training (a failure to load is not caught here). The model input is
    index 0 along dim 1 of each data batch and the target is the last index
    along that dim. A checkpoint is written every 200 iterations and the
    per-step loss is logged to ``./logs``.
    """
    # Local import kept from the original so the function does not depend on
    # a module-level ``import os``.
    import os

    checkpoint_dir = './noise_models'
    save_every = 200

    # Parse the CLI first so that ``--help`` or a bad argument exits before
    # a log directory is created.
    parser = argparse.ArgumentParser()
    parser.add_argument('--restart', '-r', action='store_true')
    args = parser.parse_args()
    log_writer = SummaryWriter('./logs')

    config = read_config('kpn_specs/att_kpn_config.conf', 'kpn_specs/configspec.conf')
    train_config = config['training']
    data_set = TrainDataSet(
        train_config['dataset_configs'],
        img_format='.bmp',
        degamma=True,
        color=True,
        blind=False
    )
    data_loader = DataLoader(
        dataset=data_set,
        batch_size=32,
        shuffle=True,
        num_workers=4
    )

    loss_fn = nn.L1Loss()

    model = Network(True).cuda()
    model.train()

    optimizer = optim.Adam(model.parameters(), lr=5e-5)

    # Make sure the checkpoint directory exists before it is read or written.
    if not os.path.exists(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    if not args.restart:
        model.load_state_dict(load_checkpoint(checkpoint_dir, best_or_latest='best'))

    global_iter = 0
    min_loss = np.inf
    loss_ave = MovingAverage(save_every)

    for epoch in range(100):
        # the two trailing items of each sample are not used here
        for step, (data, _, _) in enumerate(data_loader):
            feed = data[:, 0, ...].cuda()
            gt = data[:, -1, ...].cuda()
            pred = model(feed)

            loss = loss_fn(pred, gt)

            global_iter += 1

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # One host-side float reused for logging, averaging and printing;
            # this keeps graph-attached tensors out of the moving average.
            loss_value = loss.item()
            log_writer.add_scalar('loss', loss_value, global_iter)

            loss_ave.update(loss_value)
            if global_iter % save_every == 0:
                loss_t = loss_ave.get_value()
                min_loss = min(min_loss, loss_t)
                is_best = min_loss == loss_t
                save_checkpoint(
                    model.state_dict(),
                    is_best=is_best,
                    checkpoint_dir=checkpoint_dir,
                    n_iter=global_iter
                )
            print('{: 6d}, epoch {: 3d}, iter {: 4d}, loss {:.4f}'.format(
                global_iter, epoch, step, loss_value))

    # Flush pending events and release the writer's file handle.
    log_writer.close()
Beispiel #4
0
def train(args):
    """Train a ``MIRNet`` / ``MIRNet_kpn`` denoiser with a Charbonnier loss.

    The dataset class is selected by ``args.data_type`` (``'rgb'`` or
    ``'raw'``) and the model by ``args.model_type`` (``"MIR"`` or ``"KPN"``).
    Unless ``args.restart`` is set, training resumes from the ``'latest'``
    checkpoint in ``args.checkpoint`` when one can be loaded. A checkpoint is
    saved every ``args.save_every`` global steps and PSNR / average loss are
    printed every ``args.loss_every`` steps.

    Args:
        args: Namespace providing ``num_workers``, ``data_type``,
            ``noise_dir``, ``gt_dir``, ``image_size``, ``batch_size``,
            ``checkpoint``, ``model_type``, ``n_colors``, ``out_channels``,
            ``lr``, ``save_every``, ``loss_every``, ``restart`` and
            ``epoch``.

    Returns:
        None. An invalid ``args.model_type`` returns early after printing a
        message; an invalid ``args.data_type`` terminates via ``exit()``.
    """
    torch.set_num_threads(args.num_workers)
    torch.manual_seed(0)
    if args.data_type == 'rgb':
        data_set = SingleLoader(noise_dir=args.noise_dir,
                                gt_dir=args.gt_dir,
                                image_size=args.image_size)
    elif args.data_type == 'raw':
        data_set = SingleLoader_raw(noise_dir=args.noise_dir,
                                    gt_dir=args.gt_dir,
                                    image_size=args.image_size)
    else:
        print("Data type not valid")
        # NOTE(review): this ends the whole process, whereas an invalid
        # model type below merely returns -- confirm which is intended.
        exit()
    data_loader = DataLoader(data_set,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers,
                             pin_memory=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loss_func = losses.CharbonnierLoss().to(device)
    # loss_func = losses.AlginLoss().to(device)
    # NOTE(review): ``adaptive`` is constructed but never used by the active
    # code path; it is kept so that construction-time effects are unchanged.
    adaptive = robust_loss.adaptive.AdaptiveLossFunction(
        num_dims=3 * args.image_size**2, float_dtype=np.float32, device=device)
    checkpoint_dir = args.checkpoint
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if args.model_type == "MIR":
        model = MIRNet(in_channels=args.n_colors,
                       out_channels=args.out_channels).to(device)
    elif args.model_type == "KPN":
        model = MIRNet_kpn(in_channels=args.n_colors,
                           out_channels=args.out_channels).to(device)
    else:
        print(" Model type not valid")
        return
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    optimizer.zero_grad()
    average_loss = MovingAverage(args.save_every)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               [2, 4, 6, 8, 10, 12, 14, 16],
                                               0.8)

    # Defaults for a fresh run; overwritten below when a checkpoint loads.
    start_epoch = 0
    global_step = 0
    best_loss = np.inf
    if args.restart:
        print('=> no checkpoint file to be loaded.')
    else:
        try:
            # ``device == 'cuda'`` is always False because a torch.device
            # never compares equal to a str; compare the device type.
            checkpoint = load_checkpoint(checkpoint_dir,
                                         device.type == 'cuda', 'latest')
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            # Older checkpoints do not carry the scheduler state; for those
            # the schedule restarts as it did before.
            if 'lr_scheduler' in checkpoint:
                scheduler.load_state_dict(checkpoint['lr_scheduler'])
            start_epoch = checkpoint['epoch']
            global_step = checkpoint['global_iter']
            best_loss = checkpoint['best_loss']
            print('=> loaded checkpoint (epoch {}, global_step {})'.format(
                start_epoch, global_step))
        except Exception as err:
            # ``Exception`` rather than a bare except, so Ctrl-C and
            # SystemExit are not mistaken for a missing checkpoint.
            start_epoch = 0
            global_step = 0
            best_loss = np.inf
            print('=> no checkpoint file to be loaded.')
            print('   reason: {!r}'.format(err))

    for epoch in range(start_epoch, args.epoch):
        for step, (noise, gt) in enumerate(data_loader):
            noise = noise.to(device)
            gt = gt.to(device)
            pred = model(noise)
            loss = loss_func(pred, gt)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # store a float so the moving average holds no graph references
            average_loss.update(loss.item())

            if global_step % args.save_every == 0:
                print(len(average_loss._cache))
                current_average = average_loss.get_value()
                is_best = current_average < best_loss
                if is_best:
                    best_loss = current_average

                save_dict = {
                    'epoch': epoch,
                    'global_iter': global_step,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': scheduler.state_dict(),
                }
                save_checkpoint(save_dict, is_best, checkpoint_dir,
                                global_step)
            if global_step % args.loss_every == 0:
                print(global_step, "PSNR  : ", calculate_psnr(pred, gt))
                print(average_loss.get_value())
            global_step += 1
        print('Epoch {} is finished.'.format(epoch))
        scheduler.step()
Beispiel #5
0
def train(num_workers, cuda, restart_train, mGPU):
    """Train one of the ``*_noise_DGF`` kernel-prediction models.

    Hyper-parameters and paths are taken from the module-level ``args``
    object (``batch_size``, ``epoch``, ``save_every``, ``loss_every``,
    ``burst_length``, ``checkpoint``, ``noise_dir``, ``gt_dir``,
    ``image_size``, ``model_type``, ``wavelet_loss``, ``load_type``).
    The model returns a prediction and a noise estimate; both are compared
    against their targets with ``LossBasic`` (plus ``WaveletLoss`` when
    ``args.wavelet_loss`` is set).

    Args:
        num_workers: Number of worker processes handed to the ``DataLoader``.
        cuda: If true, move the model and every batch to the GPU.
        restart_train: If true, start from epoch 0 without loading a
            checkpoint; otherwise try to resume and fall back to a fresh
            start when loading fails.
        mGPU: If true, wrap the model in ``nn.DataParallel``.

    Returns:
        None. An unknown ``args.model_type`` returns early after printing a
        message.
    """
    # torch.set_num_threads(num_threads)

    color = True
    batch_size = args.batch_size
    lr = 2e-4
    lr_decay = 0.89125093813
    n_epoch = args.epoch
    save_freq = args.save_every
    loss_freq = args.loss_every
    lr_step_size = 100
    min_lr = 5e-6  # the learning rate is never decayed below this value
    burst_length = args.burst_length
    # checkpoint path
    checkpoint_dir = "checkpoints/" + args.checkpoint
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    # logs path: the directory is (re)created and then wiped so that every
    # run starts with empty logs. NOTE(review): this also discards the logs
    # of the run being resumed when restart_train is false.
    logs_dir = "checkpoints/logs/" + args.checkpoint
    if not os.path.exists(logs_dir):
        os.makedirs(logs_dir)
    shutil.rmtree(logs_dir)
    log_writer = SummaryWriter(logs_dir)

    # dataset and dataloader
    data_set = SingleLoader_DGF(noise_dir=args.noise_dir,
                                gt_dir=args.gt_dir,
                                image_size=args.image_size,
                                burst_length=burst_length)
    data_loader = DataLoader(
        data_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers
    )

    # model here: the variants share every constructor argument except the
    # two attention switches
    model_kwargs = dict(
        color=color,
        burst_length=burst_length,
        blind_est=False,
        kernel_size=[5],
        sep_conv=False,
        upMode="bilinear",
        core_bias=False
    )
    if args.model_type == "attKPN":
        model = Att_KPN_noise_DGF(channel_att=True, spatial_att=True,
                                  **model_kwargs)
    elif args.model_type == "attWKPN":
        model = Att_Weight_KPN_noise_DGF(channel_att=True, spatial_att=True,
                                         **model_kwargs)
    elif args.model_type == 'KPN':
        model = KPN_noise_DGF(channel_att=False, spatial_att=False,
                              **model_kwargs)
    else:
        print(" Model type not valid")
        return
    if cuda:
        model = model.cuda()

    if mGPU:
        model = nn.DataParallel(model)
    model.train()

    # loss function here
    loss_func = LossBasic()
    if args.wavelet_loss:
        print("Use wavelet loss")
        loss_func2 = WaveletLoss()
    # Optimizer here
    optimizer = optim.Adam(
        model.parameters(),
        lr=lr
    )

    optimizer.zero_grad()

    # learning rate scheduler here
    scheduler = lr_scheduler.StepLR(optimizer, step_size=lr_step_size, gamma=lr_decay)

    average_loss = MovingAverage(save_freq)

    # Defaults for a fresh run; overwritten below when a checkpoint loads.
    start_epoch = 0
    global_step = 0
    best_loss = np.inf
    if restart_train:
        print('=> training')
    else:
        try:
            checkpoint = load_checkpoint(checkpoint_dir, cuda, best_or_latest=args.load_type)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['lr_scheduler'])
            start_epoch = checkpoint['epoch']
            global_step = checkpoint['global_iter']
            best_loss = checkpoint['best_loss']
            print('=> loaded checkpoint (epoch {}, global_step {})'.format(start_epoch, global_step))
        except Exception as err:
            # ``Exception`` rather than a bare except, so Ctrl-C and
            # SystemExit are not mistaken for a missing checkpoint.
            start_epoch = 0
            global_step = 0
            best_loss = np.inf
            print('=> no checkpoint file to be loaded.')
            print('   reason: {!r}'.format(err))

    for epoch in range(start_epoch, n_epoch):
        epoch_start_time = time.time()
        t1 = time.time()
        for step, (image_noise_hr, image_noise_lr, image_gt_hr, _) in enumerate(data_loader):
            burst_noise = image_noise_lr
            gt = image_gt_hr
            if cuda:
                burst_noise = burst_noise.cuda()
                gt = gt.cuda()
                image_noise_hr = image_noise_hr.cuda()
            # Target for the noise branch. Both operands live on the same
            # device here; the previous code subtracted the CPU ground truth
            # from the already-moved GPU tensor, which fails on CUDA.
            noise_gt = image_noise_hr - gt

            _, pred, noise = model(burst_noise, image_noise_hr)

            loss_basic = loss_func(pred, gt)
            loss_noise = loss_func(noise, noise_gt)
            loss = loss_basic + loss_noise
            if args.wavelet_loss:
                loss_wave = loss_func2(pred, gt)
                loss_wave_noise = loss_func2(noise, noise_gt)
                loss = loss_basic + loss_wave + loss_noise + loss_wave_noise
            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # update the average loss with a plain float so the moving
            # average does not keep graph-attached tensors alive
            average_loss.update(loss.item())

            if not color:
                pred = pred.unsqueeze(1)
                gt = gt.unsqueeze(1)
            if global_step % loss_freq == 0:
                # calculate PSNR
                print("burst_noise  : ", burst_noise.size())
                print("gt   :  ", gt.size())
                psnr = calculate_psnr(pred, gt)
                ssim = calculate_ssim(pred, gt)

                # add scalars to tensorboardX
                log_writer.add_scalar('loss_basic', loss_basic, global_step)
                log_writer.add_scalar('loss_total', loss, global_step)
                log_writer.add_scalar('psnr', psnr, global_step)
                log_writer.add_scalar('ssim', ssim, global_step)

                # print
                print('{:-4d}\t| epoch {:2d}\t| step {:4d}\t| loss_basic: {:.4f}\t|'
                      ' loss: {:.4f}\t| PSNR: {:.2f}dB\t| SSIM: {:.4f}\t| time:{:.2f} seconds.'
                      .format(global_step, epoch, step, loss_basic, loss, psnr, ssim, time.time()-t1))
                t1 = time.time()

            if global_step % save_freq == 0:
                current_average = average_loss.get_value()
                is_best = current_average < best_loss
                if is_best:
                    best_loss = current_average

                save_dict = {
                    'epoch': epoch,
                    'global_iter': global_step,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': scheduler.state_dict()
                }
                save_checkpoint(
                    save_dict, is_best, checkpoint_dir, global_step, max_keep=10
                )
            global_step += 1
        print('Epoch {} is finished, time elapsed {:.2f} seconds.'.format(epoch, time.time()-epoch_start_time))
        # decay the learning rate once per epoch, clamped at ``min_lr``
        lr_cur = [param['lr'] for param in optimizer.param_groups]
        if lr_cur[0] > min_lr:
            scheduler.step()
        else:
            for param in optimizer.param_groups:
                param['lr'] = min_lr

    # Flush pending events and release the writer's file handle.
    log_writer.close()
Beispiel #6
0
def train(config, restart_training, num_workers, num_threads):
    """Train the model returned by ``get_model`` with a smooth-L1 loss.

    Data comes from ``OnTheFlyDataset``; each batch is a mapping with the
    keys ``'degraded_img'`` and ``'original_img'`` (and optionally
    ``'vis_exposure'``). Loss and PSNR are logged every step, images every
    ``training.vis_freq`` steps, and a checkpoint is saved every
    ``training.save_freq`` steps. Average per-phase timings are reported
    every 100 iterations of an epoch.

    Args:
        config: Mapping providing the ``"training"`` and ``"architecture"``
            sections read below.
        restart_training: If true, start from epoch 0 without loading a
            checkpoint; otherwise try to resume from the ``'best'``
            checkpoint and fall back to a fresh start when loading fails.
        num_workers: Number of worker processes handed to the ``DataLoader``.
        num_threads: Value passed to ``torch.set_num_threads``.

    Raises:
        ValueError: If ``training.optimizer`` is neither ``"adam"`` nor
            ``"sgd"``.
    """
    torch.set_num_threads(num_threads)
    print("Using {} CPU threads".format(torch.get_num_threads()))

    # How often (in iterations of an epoch) timings are aggregated/printed.
    n_report = 100
    n_print = 1000

    train_config = config["training"]

    batch_size = train_config["batch_size"]
    lr = train_config["learning_rate"]
    w_decay = train_config["weight_decay"]
    step_size = train_config["decay_steps"]
    gamma = train_config["lr_decay"]
    betas = (train_config["beta1"], train_config["beta2"])
    n_epochs = train_config["num_epochs"]
    use_cache = train_config["use_cache"]

    print("Configs:", config)
    # create dir for model
    checkpoint_dir = train_config["checkpoint_dir"]
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    logger = Logger(train_config["logs_dir"])

    use_gpu = torch.cuda.is_available()
    num_gpu = list(range(torch.cuda.device_count()))

    print("Using On the fly TRAIN datasets")
    train_data = OnTheFlyDataset(train_config["dataset_configs"],
                                 im_size=(train_config["image_width"],
                                          train_config["image_height"]),
                                 use_cache=use_cache)

    train_loader = DataLoader(train_data,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=num_workers)

    model = get_model(config["architecture"])

    l1_loss = nn.SmoothL1Loss()

    if use_gpu:
        ts = time.time()
        model = model.cuda()
        model = nn.DataParallel(model, device_ids=num_gpu)
        print("Finish cuda loading, time elapsed {}".format(time.time() - ts))

    # only parameters that require gradients are optimized
    all_parameters = [
        p for n, p in model.named_parameters() if p.requires_grad
    ]
    if train_config["optimizer"] == "adam":
        print("Using Adam.")
        optimizer = optim.Adam([{'params': all_parameters}],
                               lr=lr,
                               betas=betas,
                               weight_decay=w_decay,
                               amsgrad=True)
    elif train_config["optimizer"] == "sgd":
        print("Using SGD.")
        optimizer = optim.SGD([{'params': all_parameters}],
                              lr=lr,
                              momentum=betas[0],
                              weight_decay=w_decay)
    else:
        raise ValueError(
            "Optimizer must be 'sgd' or 'adam', received '{}'".format(
                train_config["optimizer"]))

    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=step_size,
                                    gamma=gamma)

    # Defaults for a fresh run; overwritten below when a checkpoint loads.
    start_epoch = 0
    n_global_iter = 0
    best_loss = np.inf
    average_loss = MovingAverage(train_config["n_loss_average"])
    if restart_training:
        print("=> training from scratch")
    else:
        try:
            checkpoint = load_checkpoint(checkpoint_dir, 'best')
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            n_global_iter = checkpoint['global_iter']
            best_loss = checkpoint['best_loss']
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        except Exception as err:
            # ``Exception`` rather than a bare except, so Ctrl-C and
            # SystemExit are not mistaken for a failed checkpoint load.
            start_epoch = 0
            n_global_iter = 0
            best_loss = np.inf
            print("=> load checkpoint failed, training from scratch")
            print("   reason: {!r}".format(err))

    for epoch in range(start_epoch, n_epochs):
        # NOTE(review): the scheduler is stepped before the epoch's optimizer
        # steps; recent PyTorch versions expect the opposite order.
        scheduler.step()
        ts = time.time()
        # t0..t4 are wall-clock marks inside one iteration; t4 is None until
        # the first iteration of the epoch has completed.
        t4 = None
        t_generate_data = []
        t_train_disc = []
        t_train_gen = []
        t_vis = []
        t_save = []

        for step, batch in enumerate(train_loader):
            if t4 is not None:
                # remember the previous iteration's start before overwriting
                t0_old = t0
            t0 = time.time()
            if t4 is not None:
                # collect information and print out average time.
                t_generate_data.append(t0 - t4)
                t_train_disc.append(t1 - t0_old)
                t_train_gen.append(t2 - t1)
                t_vis.append(t3 - t2)
                t_save.append(t4 - t3)
                if (step % n_report) == 0:
                    phase_means = (
                        ("t_generate_data", np.mean(t_generate_data)),
                        ("t_train_disc", np.mean(t_train_disc)),
                        ("t_train_gen", np.mean(t_train_gen)),
                        ("t_vis", np.mean(t_vis)),
                        ("t_save", np.mean(t_save)),
                    )
                    t_total = sum(value for _, value in phase_means)
                    if (step % n_print) == 0:
                        for label, value in phase_means:
                            print("{}: {:0.4g} s ({:0.4g}%)".format(
                                label, value, value / t_total * 100))
                    logger.scalar_summary('Steps per sec', 1.0 / t_total,
                                          n_global_iter)
                    t_generate_data = []
                    t_train_disc = []
                    t_train_gen = []
                    t_vis = []
                    t_save = []

            should_vis = ((n_global_iter + 1) % train_config["vis_freq"]) == 0
            degraded_img = batch['degraded_img']
            target_img = batch['original_img']
            if use_gpu:
                degraded_img = degraded_img.cuda()
                target_img = target_img.cuda()
            t1 = time.time()

            optimizer.zero_grad()
            # Run the input through the model.
            output_img = model(degraded_img)
            loss = l1_loss(output_img, target_img)
            loss.backward()
            optimizer.step()
            # ``loss.data[0]`` raises IndexError on 0-dim tensors in current
            # PyTorch; ``item()`` is the supported way to get the float.
            loss_value = loss.item()
            logger.scalar_summary('Loss', loss_value, n_global_iter)
            psnr = calculate_psnr(output_img, target_img)
            logger.scalar_summary('Train PSNR', psnr, n_global_iter)

            average_loss.update(loss_value)
            t2 = time.time()

            if step % 10 == 0:
                print("epoch{}, iter{}, loss: {}" \
                        .format(epoch, step, loss_value))
            n_global_iter += 1

            if should_vis:
                exp = batch['vis_exposure'] if 'vis_exposure' in batch else None
                # only the first three channels of the input are visualised
                img = create_vis(degraded_img[:, :3, ...], target_img,
                                 output_img, exp)
                logger.image_summary("Train Images", img, n_global_iter)

            t3 = time.time()
            if (n_global_iter % train_config["save_freq"]) == 0:
                current_average = average_loss.get_value()
                is_best = current_average < best_loss
                if is_best:
                    best_loss = current_average
                save_dict = {
                    'epoch': epoch,
                    'global_iter': n_global_iter,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                }
                save_checkpoint(save_dict, is_best, checkpoint_dir,
                                n_global_iter)
            t4 = time.time()

        print("Finish epoch {}, time elapsed {}" \
                .format(epoch, time.time() - ts))