Example no. 1
def train(**kwargs):
    opt.parse(kwargs)
    # log file
    ps = PlotSaver("Context_Baseline_p32_no_imp_plain_1_" +
                   time.strftime("%m_%d_%H:%M:%S") + ".log.txt")

    # step1: Model
    model = getattr(models, opt.model)(
        use_imp=opt.use_imp,
        model_name="Context_Baseline_p32_imp_r={r}_gama={w}_plain".format(
            r=opt.rate_loss_threshold, w=opt.rate_loss_weight)
        if opt.use_imp else "ContextBaseNoImpP32_Plain")

    if opt.use_gpu:
        model = multiple_gpu_process(model)

    cudnn.benchmark = True
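    # Note: `multiple_gpu_process` is defined elsewhere in this repo. Judging from the
    # later use of the global flag `use_data_parallel`, it presumably moves the model to
    # GPU and wraps it in nn.DataParallel; a hypothetical sketch (an assumption, not the
    # repo's actual code):
    #
    #   def multiple_gpu_process(model):
    #       global use_data_parallel
    #       use_data_parallel = t.cuda.device_count() > 1
    #       model = model.cuda()
    #       if use_data_parallel:
    #           model = t.nn.DataParallel(model)
    #       return model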

    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
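    # with mean = std = 0.5, ToTensor's [0, 1] range is mapped to [-1, 1]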

    data_transforms = transforms.Compose([transforms.ToTensor(), normalize])
    train_data = datasets.ImageFolder(opt.train_data_root, data_transforms)
    val_data = datasets.ImageFolder(opt.val_data_root, data_transforms)
    # opt.batch_size  --> 1
    train_dataloader = DataLoader(train_data,
                                  1,
                                  shuffle=True,
                                  num_workers=opt.num_workers,
                                  pin_memory=True)
    val_dataloader = DataLoader(val_data,
                                1,
                                shuffle=False,
                                num_workers=opt.num_workers,
                                pin_memory=True)

    # step3: criterion and optimizer

    mse_loss = t.nn.MSELoss(size_average=False)

    if opt.use_imp:
        rate_loss = LimuRateLoss(opt.rate_loss_threshold, opt.rate_loss_weight)
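        # LimuRateLoss is defined elsewhere in this repo; as an assumption based on the
        # content-weighted compression formulation it follows, the idea is roughly to
        # penalize the importance map only when its mean exceeds the target rate, e.g.
        #   loss ~ rate_loss_weight * max(mean(imp_map) - rate_loss_threshold, 0)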

    lr = opt.lr

    optimizer = t.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999))

    start_epoch = 0

    if opt.resume:
        if use_data_parallel:
            start_epoch = model.module.load(
                None if opt.finetune else optimizer, opt.resume, opt.finetune)
        else:
            start_epoch = model.load(None if opt.finetune else optimizer,
                                     opt.resume, opt.finetune)

        if opt.finetune:
            print('Finetune from model checkpoint file', opt.resume)
        else:
            print('Resume training from checkpoint file', opt.resume)
            print('Continue training at epoch %d.' % start_epoch)

    # step4: meters
    mse_loss_meter = AverageValueMeter()
    if opt.use_imp:
        rate_loss_meter = AverageValueMeter()
        rate_display_meter = AverageValueMeter()
        total_loss_meter = AverageValueMeter()

    previous_loss = 1e100
    tolerant_now = 0
    same_lr_epoch = 0

    # ps init

    ps.new_plot('train mse loss',
                opt.print_freq,
                xlabel="iteration",
                ylabel="train_mse_loss")
    ps.new_plot('val mse loss', 1, xlabel="epoch", ylabel="val_mse_loss")
    if opt.use_imp:
        ps.new_plot('train rate value',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_rate_value")
        ps.new_plot('train rate loss',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_rate_loss")
        ps.new_plot('train total loss',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_total_loss")
        ps.new_plot('val rate value',
                    1,
                    xlabel="epoch",
                    ylabel="val_rate_value")
        ps.new_plot('val rate loss',
                    1,
                    xlabel="epoch",
                    ylabel="val_rate_loss")
        ps.new_plot('val total loss',
                    1,
                    xlabel="epoch",
                    ylabel="val_total_loss")

    for epoch in range(start_epoch + 1, opt.max_epoch + 1):

        same_lr_epoch += 1
        # per epoch avg loss meter
        mse_loss_meter.reset()
        if opt.use_imp:
            rate_display_meter.reset()
            rate_loss_meter.reset()
            total_loss_meter.reset()
        else:
            total_loss_meter = mse_loss_meter
        # cur_epoch_loss refresh every epoch

        ps.new_plot("cur epoch train mse loss",
                    opt.print_freq,
                    xlabel="iteration in cur epoch",
                    ylabel="train_mse_loss")

        # Init val
        if (epoch == start_epoch + 1) and opt.init_val:
            print('Init validation ... ')
            if opt.use_imp:
                mse_val_loss, rate_val_loss, total_val_loss, rate_val_display = val(
                    model, val_dataloader, mse_loss, rate_loss, ps)
            else:
                mse_val_loss = val(model, val_dataloader, mse_loss, None, ps)

            ps.add_point('val mse loss', mse_val_loss)
            if opt.use_imp:
                ps.add_point('val rate value', rate_val_display)
                ps.add_point('val rate loss', rate_val_loss)
                ps.add_point('val total loss', total_val_loss)

            # make plot
            ps.make_plot('val mse loss')

            if opt.use_imp:
                ps.make_plot('val rate value')
                ps.make_plot('val rate loss')
                ps.make_plot('val total loss')

            # log sth.
            if opt.use_imp:
                ps.log(
                    'Init Val @ Epoch:{epoch}, lr:{lr}, val_mse_loss: {val_mse_loss}, val_rate_loss: {val_rate_loss}, val_total_loss: {val_total_loss}, val_rate_display: {val_rate_display} '
                    .format(epoch=epoch,
                            lr=lr,
                            val_mse_loss=mse_val_loss,
                            val_rate_loss=rate_val_loss,
                            val_total_loss=total_val_loss,
                            val_rate_display=rate_val_display))
            else:
                ps.log(
                    'Init Val @ Epoch:{epoch}, lr:{lr}, val_mse_loss:{val_mse_loss}'
                    .format(
                        epoch=epoch,
                        lr=lr,
                        # train_mse_loss = mse_loss_meter.value()[0],
                        val_mse_loss=mse_val_loss))

        model.train()

        def to_patches(x, patch_size):
            print(type(x))
            _, h, w = x.size()
            print('original h,w', h, w)
            dw = (patch_size - w % patch_size) if w % patch_size else 0
            dh = (patch_size - h % patch_size) if h % patch_size else 0

            x = F.pad(x, (dw // 2, dw - dw // 2, dh // 2, dh - dh // 2))
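            # e.g. (illustrative numbers): w = 500, patch_size = 32 -> w % 32 = 20,
            # dw = 12, so F.pad adds 6 columns on the left and 6 on the right;
            # height is padded the same way, making both dims multiples of patch_size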

            _, h, w = x.size()
            print(h, w)
            pdb.set_trace()

            tv.utils.save_image(x.data, 'test_images/padded_original_img.png')

            num_patch_x = w // patch_size
            num_patch_y = h // patch_size
            print(num_patch_x, num_patch_y)
            patches = []
            for i in range(num_patch_y):
                for j in range(num_patch_x):
                    patch = x[:, i * patch_size:(i + 1) * patch_size,
                              j * patch_size:(j + 1) * patch_size]
                    patches.append(patch.contiguous())
                # handle a leftover column narrower than patch_size (unreachable in
                # practice after padding, but keep the partial patch if it occurs)
                if (j + 1) * patch_size < w:
                    extra_patch = x[:, i * patch_size:(i + 1) * patch_size,
                                    (j + 1) * patch_size:w]
                    patches.append(extra_patch.contiguous())

            return patches

        # _ is the corresponding label; compression doesn't use it.
        for idx, (data, _) in enumerate(train_dataloader):

            if idx == 0:
                print('skip idx =', idx)
                continue
            # ipt = Variable(data[0])
            ipt = data[0]
            # if opt.use_gpu:
            #     # if not use_data_parallel:  # because ipt is also target, so we still need to transfer it to CUDA
            #         ipt = ipt.cuda(async = True)

            # ipt is a full image, so I need to split it into crops
            # so set batch_size = 1 for simplicity at first

            # pdb.set_trace()

            tv.utils.save_image(ipt, "test_imgs/original.png")
            patches = to_patches(ipt, opt.patch_size)

            for (i, p) in enumerate(patches):
                tv.utils.save_image(p, "test_imgs/%s.png" % i)

            pdb.set_trace()

            optimizer.zero_grad()
            # reconstructed, imp_mask_sigmoid = model(ipt)
            reconstructed = model(ipt)

            # print ('imp_mask_height', model.imp_mask_height)
            # pdb.set_trace()

            # print ('type recons', type(reconstructed.data))
            loss = mse_loss(reconstructed, ipt)
            caffe_loss = loss / (2 * opt.batch_size)
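            # Caffe-style Euclidean loss: MSELoss(size_average=False) returns the summed
            # squared error, and dividing by 2 * batch_size matches Caffe's
            # EuclideanLoss convention of SSE / (2N)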

            if opt.use_imp:
                # the importance map is exposed as an attribute on the model
                # (unwrap DataParallel first), as in the other training scripts here
                rate_loss_display = (model.module if use_data_parallel else
                                     model).imp_mask_sigmoid
                rate_loss_ = rate_loss(rate_loss_display)

                total_loss = caffe_loss + rate_loss_
            else:
                total_loss = caffe_loss

            # 1.
            total_loss.backward()
            # caffe_loss.backward()
            optimizer.step()

            mse_loss_meter.add(caffe_loss.data[0])

            if opt.use_imp:
                rate_loss_meter.add(rate_loss_.data[0])
                rate_display_meter.add(rate_loss_display.data.mean())
                total_loss_meter.add(total_loss.data[0])

            if idx % opt.print_freq == opt.print_freq - 1:
                ps.add_point(
                    'train mse loss',
                    mse_loss_meter.value()[0]
                    if opt.print_smooth else caffe_loss.data[0])
                ps.add_point(
                    'cur epoch train mse loss',
                    mse_loss_meter.value()[0]
                    if opt.print_smooth else caffe_loss.data[0])
                # print (rate_loss_display.data.mean())
                if opt.use_imp:
                    ps.add_point(
                        'train rate value',
                        rate_display_meter.value()[0]
                        if opt.print_smooth else rate_loss_display.data.mean())
                    ps.add_point(
                        'train rate loss',
                        rate_loss_meter.value()[0]
                        if opt.print_smooth else rate_loss_.data[0])
                    ps.add_point(
                        'train total loss',
                        total_loss_meter.value()[0]
                        if opt.print_smooth else total_loss.data[0])
                # pdb.set_trace()
                # progress_bar.set_description('epoch %d/%d, loss = %.2f' % (epoch, opt.max_epoch, total_loss.data[0]))

                #  2.
                # ps.log('Epoch %d/%d, Iter %d/%d, loss = %.2f, lr = %.8f' % (epoch, opt.max_epoch, idx, len(train_dataloader), total_loss_meter.value()[0], lr))
                # ps.log('Epoch %d/%d, Iter %d/%d, loss = %.2f, lr = %.8f' % (epoch, opt.max_epoch, idx, len(train_dataloader), mse_loss_meter.value()[0], lr))

                # ps.log('loss = %f' % caffe_loss.data[0])
                # print(total_loss.data[0])
                # input('waiting......')

                if not opt.use_imp:
                    ps.log('Epoch %d/%d, Iter %d/%d, loss = %.2f, lr = %.8f' %
                           (epoch, opt.max_epoch, idx, len(train_dataloader),
                            total_loss_meter.value()[0], lr))
                else:
                    ps.log(
                        'Epoch %d/%d, Iter %d/%d, loss = %.2f, mse_loss = %.2f, rate_loss = %.2f, rate_display = %.2f, lr = %.8f'
                        %
                        (epoch, opt.max_epoch, idx, len(train_dataloader),
                         total_loss_meter.value()[0],
                         mse_loss_meter.value()[0], rate_loss_meter.value()[0],
                         rate_display_meter.value()[0], lr))
                # enter debug mode
                if os.path.exists(opt.debug_file):
                    # import pdb
                    pdb.set_trace()

        if use_data_parallel:
            # print (type(model.module))
            # print (model)
            # print (type(model))
            model.module.save(optimizer, epoch)
        else:
            model.save(optimizer, epoch)

        # print ('case error', total_loss.data[0])
        # print ('smoothed error', total_loss_meter.value()[0])

        # plot before val so progress is visible sooner
        ps.make_plot('train mse loss'
                     )  # all epochs share the same image, so pass the default "" for epoch
        ps.make_plot('cur epoch train mse loss', epoch)
        if opt.use_imp:
            ps.make_plot("train rate value")
            ps.make_plot("train rate loss")
            ps.make_plot("train total loss")

        # val
        if opt.use_imp:
            mse_val_loss, rate_val_loss, total_val_loss, rate_val_display = val(
                model, val_dataloader, mse_loss, rate_loss, ps)
        else:
            mse_val_loss = val(model, val_dataloader, mse_loss, None, ps)

        ps.add_point('val mse loss', mse_val_loss)
        if opt.use_imp:
            ps.add_point('val rate value', rate_val_display)
            ps.add_point('val rate loss', rate_val_loss)
            ps.add_point('val total loss', total_val_loss)

        # make plot
        # ps.make_plot('train mse loss', "")   # all epoch share a same img, so give "" to epoch
        # ps.make_plot('cur epoch train mse loss',epoch)
        ps.make_plot('val mse loss')

        if opt.use_imp:
            # ps.make_plot("train rate value","")
            # ps.make_plot("train rate loss","")
            # ps.make_plot("train total loss","")
            ps.make_plot('val rate value')
            ps.make_plot('val rate loss')
            ps.make_plot('val total loss')

        # log sth.
        if opt.use_imp:
            ps.log(
                'Epoch:{epoch}, lr:{lr}, train_mse_loss: {train_mse_loss}, train_rate_loss: {train_rate_loss}, train_total_loss: {train_total_loss}, train_rate_display: {train_rate_display} \n\
val_mse_loss: {val_mse_loss}, val_rate_loss: {val_rate_loss}, val_total_loss: {val_total_loss}, val_rate_display: {val_rate_display} '
                .format(epoch=epoch,
                        lr=lr,
                        train_mse_loss=mse_loss_meter.value()[0],
                        train_rate_loss=rate_loss_meter.value()[0],
                        train_total_loss=total_loss_meter.value()[0],
                        train_rate_display=rate_display_meter.value()[0],
                        val_mse_loss=mse_val_loss,
                        val_rate_loss=rate_val_loss,
                        val_total_loss=total_val_loss,
                        val_rate_display=rate_val_display))
        else:
            ps.log(
                'Epoch:{epoch}, lr:{lr}, train_mse_loss:{train_mse_loss}, val_mse_loss:{val_mse_loss}'
                .format(epoch=epoch,
                        lr=lr,
                        train_mse_loss=mse_loss_meter.value()[0],
                        val_mse_loss=mse_val_loss))

        # Adaptively adjust the lr:
        # for each lr, if the loss is higher than the previous loss
        # opt.tolerant_max times, update the learning rate.
        # if loss_meter.value()[0] > previous_loss:
        if opt.use_early_adjust:
            if total_loss_meter.value()[0] > previous_loss:
                tolerant_now += 1
                if tolerant_now == opt.tolerant_max:
                    tolerant_now = 0
                    same_lr_epoch = 0
                    lr = lr * opt.lr_decay
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr
                    print('Due to early stop anneal lr to', lr, 'at epoch',
                          epoch)
                    ps.log('Due to early stop anneal lr to %.10f at epoch %d' %
                           (lr, epoch))

            else:
                tolerant_now -= 1

        # if same_lr_epoch and same_lr_epoch % opt.lr_anneal_epochs == 0:
        #     same_lr_epoch = 0
        #     tolerant_now = 0
        #     lr = lr * opt.lr_decay
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr
        #     print ('Due to full epochs anneal lr to',lr,'at epoch',epoch)
        #     ps.log ('Due to full epochs anneal lr to %.10f at epoch %d' % (lr, epoch))

        if opt.use_file_decay_lr and os.path.exists(opt.lr_decay_file):
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        # previous_loss = total_loss_meter.value()[0] if opt.use_imp else mse_loss_meter.value()[0]
        previous_loss = total_loss_meter.value()[0]
def train(**kwargs):
    opt.parse(kwargs)
    # log file
    ps = PlotSaver("FrozenCNN_ResNet50_RGB_" +
                   time.strftime("%m_%d_%H:%M:%S") + ".log.txt")

    # step1: Model
    compression_model = getattr(models, opt.model)(
        use_imp=opt.use_imp,
        model_name="CWCNN_limu_ImageNet_imp_r={r}_γ={w}_for_resnet50".format(
            r=opt.rate_loss_threshold, w=opt.rate_loss_weight)
        if opt.use_imp else None)

    compression_model.load(None, opt.compression_model_ckpt)
    compression_model.eval()

    # if use_original_RGB:
    #     resnet_50 = resnet50()    # Official ResNet
    # else:
    #     resnet_50 = ResNet50()    # My ResNet
    c_resnet_51 = cResNet51()
    if opt.use_gpu:
        # compression_model.cuda()
        # resnet_50.cuda()
        compression_model = multiple_gpu_process(compression_model)
        c_resnet_51 = multiple_gpu_process(c_resnet_51)

    # freeze the compression network
    for param in compression_model.parameters():
        # print (param.requires_grad)
        param.requires_grad = False

    cudnn.benchmark = True

    # pdb.set_trace()

    # step2: Data
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_data_transforms = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(), normalize
    ])
    val_data_transforms = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(), normalize
    ])

    train_data = datasets.ImageFolder(opt.train_data_root,
                                      train_data_transforms)
    val_data = datasets.ImageFolder(opt.val_data_root, val_data_transforms)
    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers,
                                  pin_memory=True)
    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers,
                                pin_memory=True)

    # step3: criterion and optimizer

    class_loss = t.nn.CrossEntropyLoss()
    lr = opt.lr
    # optimizer = t.optim.Adam(resnet_50.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=opt.weight_decay)
    optimizer = t.optim.SGD(c_resnet_51.parameters(),
                            lr=lr,
                            momentum=opt.momentum,
                            weight_decay=opt.weight_decay)
    start_epoch = 0

    if opt.resume:
        start_epoch = c_resnet_51.module.load(
            None if opt.finetune else optimizer, opt.resume, opt.finetune)

        if opt.finetune:
            print('Finetune from model checkpoint file', opt.resume)
        else:
            print('Resume training from checkpoint file', opt.resume)
            print('Continue training at epoch %d.' % start_epoch)

    # step4: meters
    class_loss_meter = AverageValueMeter()

    class_acc_top5_meter = AverageValueMeter()
    class_acc_top1_meter = AverageValueMeter()

    # class_loss_meter = AverageMeter()
    # class_acc_top5_meter = AverageMeter()
    # class_acc_top1_meter = AverageMeter()

    # ps init

    ps.new_plot('train class loss',
                opt.print_freq,
                xlabel="iteration",
                ylabel="train_CE_loss")
    ps.new_plot('val class loss', 1, xlabel="epoch", ylabel="val_CE_loss")

    ps.new_plot('train top_5 acc',
                opt.print_freq,
                xlabel="iteration",
                ylabel="train_top5_acc")
    ps.new_plot('train top_1 acc',
                opt.print_freq,
                xlabel="iteration",
                ylabel="train_top1_acc")

    ps.new_plot('val top_5 acc', 1, xlabel="epoch", ylabel="val_top_5_acc")
    ps.new_plot('val top_1 acc', 1, xlabel="epoch", ylabel="val_top_1_acc")

    for epoch in range(start_epoch + 1, opt.max_epoch + 1):
        # per epoch avg loss meter
        class_loss_meter.reset()

        class_acc_top1_meter.reset()
        class_acc_top5_meter.reset()

        # cur_epoch_loss refresh every epoch

        ps.new_plot("cur epoch train class loss",
                    opt.print_freq,
                    xlabel="iteration in cur epoch",
                    ylabel="cur_train_CE_loss")

        c_resnet_51.train()

        for idx, (data, label) in enumerate(train_dataloader):
            ipt = Variable(data)
            label = Variable(label)

            if opt.use_gpu:
                ipt = ipt.cuda()
                label = label.cuda()

            optimizer.zero_grad()
            # if not use_original_RGB:
            # compressed_RGB = compression_model(ipt)
            # else:
            # compressed_RGB = ipt
            # We only want the compressed features; there is no need to decode them.
            compressed_feat = compression_model(ipt, need_decode=False)
            # print ('RGB', compressed_RGB.requires_grad)
            predicted = c_resnet_51(compressed_feat)

            class_loss_ = class_loss(predicted, label)

            class_loss_.backward()
            optimizer.step()

            class_loss_meter.add(class_loss_.data[0])
            # class_loss_meter.update(class_loss_.data[0], ipt.size(0))

            acc1, acc5 = accuracy(predicted.data, label.data, topk=(1, 5))
            # pdb.set_trace()

            class_acc_top1_meter.add(acc1[0])
            class_acc_top5_meter.add(acc5[0])
            # class_acc_top1_meter.update(acc1[0], ipt.size(0))
            # class_acc_top5_meter.update(acc5[0], ipt.size(0))
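            # `accuracy` is a repo helper; presumably similar to the standard top-k
            # helper from the PyTorch ImageNet example (a sketch, not the repo's
            # actual code):
            #
            #   def accuracy(output, target, topk=(1,)):
            #       maxk = max(topk)
            #       batch_size = target.size(0)
            #       _, pred = output.topk(maxk, 1, True, True)  # top-k class indices
            #       pred = pred.t()
            #       correct = pred.eq(target.view(1, -1).expand_as(pred))
            #       res = []
            #       for k in topk:
            #           correct_k = correct[:k].view(-1).float().sum(0)
            #           res.append(correct_k.mul_(100.0 / batch_size))
            #       return res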

            if idx % opt.print_freq == opt.print_freq - 1:
                ps.add_point(
                    'train class loss',
                    class_loss_meter.value()[0]
                    if opt.print_smooth else class_loss_.data[0])
                ps.add_point(
                    'cur epoch train class loss',
                    class_loss_meter.value()[0]
                    if opt.print_smooth else class_loss_.data[0])
                ps.add_point(
                    'train top_5 acc',
                    class_acc_top5_meter.value()[0]
                    if opt.print_smooth else acc5[0])
                ps.add_point(
                    'train top_1 acc',
                    class_acc_top1_meter.value()[0]
                    if opt.print_smooth else acc1[0])

                ps.log(
                    'Epoch %d/%d, Iter %d/%d, class loss = %.4f, top 5 acc = %.2f %%, top 1 acc  = %.2f %%, lr = %.8f'
                    % (epoch, opt.max_epoch, idx, len(train_dataloader),
                       class_loss_meter.value()[0],
                       class_acc_top5_meter.value()[0],
                       class_acc_top1_meter.value()[0], lr))
                # enter debug mode
                if os.path.exists(opt.debug_file):
                    pdb.set_trace()

        if use_data_parallel:
            c_resnet_51.module.save(optimizer, epoch)

        # plot before val so progress is visible sooner
        ps.make_plot(
            'train class loss'
        )  # all epochs share the same image, so use the default "" for epoch
        ps.make_plot('cur epoch train class loss', epoch)
        ps.make_plot("train top_5 acc")
        ps.make_plot("train top_1 acc")

        val_class_loss, val_top5_acc, val_top1_acc = val(
            compression_model, c_resnet_51, val_dataloader, class_loss, None,
            ps)

        ps.add_point('val class loss', val_class_loss)
        ps.add_point('val top_5 acc', val_top5_acc)
        ps.add_point('val top_1 acc', val_top1_acc)

        ps.make_plot('val class loss')
        ps.make_plot('val top_5 acc')
        ps.make_plot('val top_1 acc')

        ps.log(
            'Epoch:{epoch}, lr:{lr}, train_class_loss: {train_class_loss}, train_top5_acc: {train_top5_acc} %, train_top1_acc: {train_top1_acc} %, \
val_class_loss: {val_class_loss}, val_top5_acc: {val_top5_acc} %, val_top1_acc: {val_top1_acc} %'
            .format(epoch=epoch,
                    lr=lr,
                    train_class_loss=class_loss_meter.value()[0],
                    train_top5_acc=class_acc_top5_meter.value()[0],
                    train_top1_acc=class_acc_top1_meter.value()[0],
                    val_class_loss=val_class_loss,
                    val_top5_acc=val_top5_acc,
                    val_top1_acc=val_top1_acc))

        # adjust lr
        if epoch in opt.lr_decay_step_list:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
def train(**kwargs):
    opt.parse(kwargs)
    # log file
    # tpse: test patch size effect
    ps = PlotSaver(opt.exp_id[4:] + "_" + time.strftime("%m_%d__%H:%M:%S") +
                   ".log.txt")

    # step1: Model
    model = getattr(models, opt.model)(
        use_imp=opt.use_imp,
        model_name="TPSE_p{ps}_imp_r={r}_gama={w}".format(
            ps=opt.patch_size,
            r=opt.rate_loss_threshold,
            w=opt.rate_loss_weight)
        if opt.use_imp else "TPSE_p{ps}_no_imp".format(ps=opt.patch_size))

    if opt.use_gpu:
        model = multiple_gpu_process(model)

    cudnn.benchmark = True

    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    train_data_transforms = transforms.Compose([
        transforms.RandomCrop(opt.patch_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(), normalize
    ])

    val_data_transforms = transforms.Compose(
        [transforms.CenterCrop(256),
         transforms.ToTensor(), normalize])

    train_data = datasets.ImageFolder(opt.train_data_root,
                                      train_data_transforms)
    val_data = datasets.ImageFolder(opt.val_data_root, val_data_transforms)

    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers,
                                  pin_memory=True)
    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers,
                                pin_memory=True)

    # step3: criterion and optimizer

    mse_loss = t.nn.MSELoss(size_average=False)
    if opt.use_imp:
        rate_loss = LimuRateLoss(opt.rate_loss_threshold, opt.rate_loss_weight)
    lr = opt.lr
    optimizer = t.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999))

    start_epoch = 0

    if opt.resume:
        if use_data_parallel:
            start_epoch = model.module.load(
                None if opt.finetune else optimizer, opt.resume, opt.finetune)
        else:
            start_epoch = model.load(None if opt.finetune else optimizer,
                                     opt.resume, opt.finetune)

        if opt.finetune:
            print('Finetune from model checkpoint file', opt.resume)
        else:
            print('Resume training from checkpoint file', opt.resume)
            print('Continue training at epoch %d.' % start_epoch)

    # step4: meters
    mse_loss_meter = AverageValueMeter()
    if opt.use_imp:
        rate_loss_meter = AverageValueMeter()
        rate_display_meter = AverageValueMeter()
        total_loss_meter = AverageValueMeter()

    previous_loss = 1e100
    tolerant_now = 0
    same_lr_epoch = 0

    # ps init

    ps.new_plot('train mse loss',
                opt.print_freq,
                xlabel="iteration",
                ylabel="train_mse_loss")
    ps.new_plot('val mse loss', 1, xlabel="epoch", ylabel="val_mse_loss")
    if opt.use_imp:
        ps.new_plot('train rate value',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_rate_value")
        ps.new_plot('train rate loss',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_rate_loss")
        ps.new_plot('train total loss',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_total_loss")
        ps.new_plot('val rate value',
                    1,
                    xlabel="epoch",
                    ylabel="val_rate_value")
        ps.new_plot('val rate loss', 1, xlabel="epoch", ylabel="val_rate_loss")
        ps.new_plot('val total loss',
                    1,
                    xlabel="epoch",
                    ylabel="val_total_loss")

    for epoch in range(start_epoch + 1, opt.max_epoch + 1):

        same_lr_epoch += 1
        # per epoch avg loss meter
        mse_loss_meter.reset()
        if opt.use_imp:
            rate_display_meter.reset()
            rate_loss_meter.reset()
            total_loss_meter.reset()
        else:
            total_loss_meter = mse_loss_meter

        ps.new_plot("cur epoch train mse loss",
                    opt.print_freq,
                    xlabel="iteration in cur epoch",
                    ylabel="train_mse_loss")

        # Init val
        if (epoch == start_epoch + 1) and opt.init_val:
            print('Init validation ... ')
            if opt.use_imp:
                mse_val_loss, rate_val_loss, total_val_loss, rate_val_display = val(
                    model, val_dataloader, mse_loss, rate_loss, ps, -1, True)
            else:
                mse_val_loss = val(model, val_dataloader, mse_loss, None, ps,
                                   -1, True)

            ps.add_point('val mse loss', mse_val_loss)
            if opt.use_imp:
                ps.add_point('val rate value', rate_val_display)
                ps.add_point('val rate loss', rate_val_loss)
                ps.add_point('val total loss', total_val_loss)

            # make plot
            ps.make_plot('val mse loss')

            if opt.use_imp:
                ps.make_plot('val rate value')
                ps.make_plot('val rate loss')
                ps.make_plot('val total loss')

            # log sth.
            if opt.use_imp:
                ps.log(
                    'Init Val @ Epoch:{epoch}, lr:{lr}, val_mse_loss: {val_mse_loss}, val_rate_loss: {val_rate_loss}, val_total_loss: {val_total_loss}, val_rate_display: {val_rate_display} '
                    .format(epoch=epoch,
                            lr=lr,
                            val_mse_loss=mse_val_loss,
                            val_rate_loss=rate_val_loss,
                            val_total_loss=total_val_loss,
                            val_rate_display=rate_val_display))
            else:
                ps.log(
                    'Init Val @ Epoch:{epoch}, lr:{lr}, val_mse_loss:{val_mse_loss}'
                    .format(epoch=epoch, lr=lr, val_mse_loss=mse_val_loss))

        # start training
        model.train()

        # _ is the corresponding label; compression doesn't use it.
        for idx, (data, _) in enumerate(train_dataloader):

            ipt = Variable(data)

            if opt.use_gpu:
                ipt = ipt.cuda(async=True)

            optimizer.zero_grad()

            reconstructed = model(ipt)

            loss = mse_loss(reconstructed, ipt)
            caffe_loss = loss / (2 * opt.batch_size)

            # Normalize relative to an 8x8 patch: a patch of side patch_size has
            # (patch_size / 8)^2 times as many pixels as an 8x8 patch, so divide
            # by that factor to keep the summed MSE on a comparable scale.
            caffe_loss = caffe_loss / ((opt.patch_size // 8)**2)
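            # e.g. (illustrative number, not from the config): patch_size = 32 gives
            # (32 // 8) ** 2 = 16, i.e. a 32x32 patch holds 16x the pixels of an
            # 8x8 patch, so its summed MSE is divided by 16.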

            if opt.use_imp:
                # the importance map is exposed as an attribute on the model
                # (unwrap DataParallel first), as in the other training scripts here
                rate_loss_display = (model.module if use_data_parallel else
                                     model).imp_mask_sigmoid
                rate_loss_ = rate_loss(rate_loss_display)

                total_loss = caffe_loss + rate_loss_
            else:
                total_loss = caffe_loss

            total_loss.backward()

            optimizer.step()

            mse_loss_meter.add(caffe_loss.data[0])

            if opt.use_imp:
                rate_loss_meter.add(rate_loss_.data[0])
                rate_display_meter.add(rate_loss_display.data.mean())
                total_loss_meter.add(total_loss.data[0])

            if idx % opt.print_freq == opt.print_freq - 1:
                ps.add_point(
                    'train mse loss',
                    mse_loss_meter.value()[0]
                    if opt.print_smooth else caffe_loss.data[0])
                ps.add_point(
                    'cur epoch train mse loss',
                    mse_loss_meter.value()[0]
                    if opt.print_smooth else caffe_loss.data[0])
                # print (rate_loss_display.data.mean())
                if opt.use_imp:
                    ps.add_point(
                        'train rate value',
                        rate_display_meter.value()[0]
                        if opt.print_smooth else rate_loss_display.data.mean())
                    ps.add_point(
                        'train rate loss',
                        rate_loss_meter.value()[0]
                        if opt.print_smooth else rate_loss_.data[0])
                    ps.add_point(
                        'train total loss',
                        total_loss_meter.value()[0]
                        if opt.print_smooth else total_loss.data[0])

                if not opt.use_imp:
                    ps.log('Epoch %d/%d, Iter %d/%d, loss = %.2f, lr = %.8f' %
                           (epoch, opt.max_epoch, idx, len(train_dataloader),
                            total_loss_meter.value()[0], lr))
                else:
                    ps.log(
                        'Epoch %d/%d, Iter %d/%d, loss = %.2f, mse_loss = %.2f, rate_loss = %.2f, rate_display = %.2f, lr = %.8f'
                        %
                        (epoch, opt.max_epoch, idx, len(train_dataloader),
                         total_loss_meter.value()[0],
                         mse_loss_meter.value()[0], rate_loss_meter.value()[0],
                         rate_display_meter.value()[0], lr))
                # enter debug mode
                if os.path.exists(opt.debug_file):
                    # import pdb
                    pdb.set_trace()

        if use_data_parallel:
            model.module.save(optimizer, epoch)
        else:
            model.save(optimizer, epoch)

        # plot before val so progress is visible sooner
        ps.make_plot('train mse loss'
                     )  # all epochs share the same image, so pass the default "" for epoch
        ps.make_plot('cur epoch train mse loss', epoch)
        if opt.use_imp:
            ps.make_plot("train rate value")
            ps.make_plot("train rate loss")
            ps.make_plot("train total loss")

        # val
        if opt.use_imp:
            mse_val_loss, rate_val_loss, total_val_loss, rate_val_display = val(
                model, val_dataloader, mse_loss, rate_loss, ps, epoch,
                epoch % opt.show_inf_imgs_T == 0)
        else:
            mse_val_loss = val(model, val_dataloader, mse_loss, None, ps,
                               epoch, epoch % opt.show_inf_imgs_T == 0)

        ps.add_point('val mse loss', mse_val_loss)
        if opt.use_imp:
            ps.add_point('val rate value', rate_val_display)
            ps.add_point('val rate loss', rate_val_loss)
            ps.add_point('val total loss', total_val_loss)

        ps.make_plot('val mse loss')

        if opt.use_imp:
            ps.make_plot('val rate value')
            ps.make_plot('val rate loss')
            ps.make_plot('val total loss')

        # log sth.
        if opt.use_imp:
            ps.log(
                'Epoch:{epoch}, lr:{lr}, train_mse_loss: {train_mse_loss}, train_rate_loss: {train_rate_loss}, train_total_loss: {train_total_loss}, train_rate_display: {train_rate_display} \n\
val_mse_loss: {val_mse_loss}, val_rate_loss: {val_rate_loss}, val_total_loss: {val_total_loss}, val_rate_display: {val_rate_display} '
                .format(epoch=epoch,
                        lr=lr,
                        train_mse_loss=mse_loss_meter.value()[0],
                        train_rate_loss=rate_loss_meter.value()[0],
                        train_total_loss=total_loss_meter.value()[0],
                        train_rate_display=rate_display_meter.value()[0],
                        val_mse_loss=mse_val_loss,
                        val_rate_loss=rate_val_loss,
                        val_total_loss=total_val_loss,
                        val_rate_display=rate_val_display))
        else:
            ps.log(
                'Epoch:{epoch}, lr:{lr}, train_mse_loss:{train_mse_loss}, val_mse_loss:{val_mse_loss}'
                .format(epoch=epoch,
                        lr=lr,
                        train_mse_loss=mse_loss_meter.value()[0],
                        val_mse_loss=mse_val_loss))

        # Adaptively adjust the lr:
        # for each lr, if the loss is higher than the previous loss
        # opt.tolerant_max times, update the learning rate.
        # if loss_meter.value()[0] > previous_loss:
        if opt.use_early_adjust:
            if total_loss_meter.value()[0] > previous_loss:
                tolerant_now += 1
                if tolerant_now == opt.tolerant_max:
                    tolerant_now = 0
                    same_lr_epoch = 0
                    lr = lr * opt.lr_decay
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr
                    print('Due to early stop anneal lr to', lr, 'at epoch',
                          epoch)
                    ps.log('Due to early stop anneal lr to %.10f at epoch %d' %
                           (lr, epoch))

            else:
                tolerant_now -= 1

        if same_lr_epoch and same_lr_epoch % opt.lr_anneal_epochs == 0:
            same_lr_epoch = 0
            tolerant_now = 0
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            print('Due to full epochs anneal lr to', lr, 'at epoch', epoch)
            ps.log('Due to full epochs anneal lr to %.10f at epoch %d' %
                   (lr, epoch))
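        # e.g. (illustrative values, not from the config): with lr_anneal_epochs = 30
        # and lr_decay = 0.1, an initial lr of 1e-4 becomes 1e-5 after epoch 30 and
        # 1e-6 after epoch 60, unless the early-stop branch above resets the counter.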

        if opt.use_file_decay_lr and os.path.exists(opt.lr_decay_file):
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        previous_loss = total_loss_meter.value()[0]
def train(**kwargs):
    opt.parse(kwargs)
    # vis = Visualizer(opt.env)
    # log file
    ps = PlotSaver("Train_ImageNet12_With_ImpMap_" +
                   time.strftime("%m_%d_%H:%M:%S") + ".log.txt")

    # step1: Model
    model = getattr(models, opt.model)(
        use_imp=opt.use_imp,
        model_name="CWCNN_limu_ImageNet_imp_r={r}_γ={w}".format(
            r=opt.rate_loss_threshold, w=opt.rate_loss_weight)
        if opt.use_imp else None)
    # if opt.use_imp else "test_pytorch")
    if opt.use_gpu:
        # model = multiple_gpu_process(model)
        model.cuda()


#    pdb.set_trace()

    cudnn.benchmark = True

    # step2: Data
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_data_transforms = transforms.Compose([
        transforms.Resize(256),
        #transforms.Scale(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize
    ])
    val_data_transforms = transforms.Compose([
        transforms.Resize(256),
        #transforms.Scale(256),
        transforms.CenterCrop(224),
        # transforms.TenCrop(224),
        # transforms.Lambda(lambda crops: t.stack(([normalize(transforms.ToTensor()(crop)) for crop in crops]))),
        transforms.ToTensor(),
        normalize
    ])
    # train_data = ImageNet_200k(opt.train_data_root, train=True, transforms=data_transforms)
    # val_data = ImageNet_200k(opt.val_data_root, train = False, transforms=data_transforms)
    train_data = datasets.ImageFolder(opt.train_data_root,
                                      train_data_transforms)
    val_data = datasets.ImageFolder(opt.val_data_root, val_data_transforms)
    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers,
                                  pin_memory=True)
    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers,
                                pin_memory=True)

    # step3: criterion and optimizer

    mse_loss = t.nn.MSELoss(size_average=False)

    if opt.use_imp:
        # rate_loss = RateLoss(opt.rate_loss_threshold, opt.rate_loss_weight)
        rate_loss = LimuRateLoss(opt.rate_loss_threshold, opt.rate_loss_weight)

    lr = opt.lr

    optimizer = t.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999))

    start_epoch = 0

    if opt.resume:
        if hasattr(model, 'module'):
            start_epoch = model.module.load(
                None if opt.finetune else optimizer, opt.resume, opt.finetune)
        else:
            start_epoch = model.load(None if opt.finetune else optimizer,
                                     opt.resume, opt.finetune)

        if opt.finetune:
            print('Finetune from model checkpoint file', opt.resume)
        else:
            print('Resume training from checkpoint file', opt.resume)
            print('Continue training at epoch %d.' % start_epoch)

    # step4: meters
    mse_loss_meter = AverageValueMeter()
    if opt.use_imp:
        rate_loss_meter = AverageValueMeter()
        rate_display_meter = AverageValueMeter()
        total_loss_meter = AverageValueMeter()

    previous_loss = 1e100
    tolerant_now = 0
    same_lr_epoch = 0

    # ps init

    ps.new_plot('train mse loss',
                opt.print_freq,
                xlabel="iteration",
                ylabel="train_mse_loss")
    ps.new_plot('val mse loss', 1, xlabel="epoch", ylabel="val_mse_loss")
    if opt.use_imp:
        ps.new_plot('train rate value',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_rate_value")
        ps.new_plot('train rate loss',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_rate_loss")
        ps.new_plot('train total loss',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_total_loss")
        ps.new_plot('val rate value',
                    1,
                    xlabel="epoch",
                    ylabel="val_rate_value")
        ps.new_plot('val rate loss',
                    1,
                    xlabel="epoch",
                    ylabel="val_rate_loss")
        ps.new_plot('val total loss',
                    1,
                    xlabel="epoch",
                    ylabel="val_total_loss")

    for epoch in range(start_epoch + 1, opt.max_epoch + 1):
        same_lr_epoch += 1
        # per epoch avg loss meter
        mse_loss_meter.reset()
        if opt.use_imp:
            rate_display_meter.reset()
            rate_loss_meter.reset()
            total_loss_meter.reset()
        else:
            total_loss_meter = mse_loss_meter
        # cur_epoch_loss refresh every epoch

        ps.new_plot("cur epoch train mse loss",
                    opt.print_freq,
                    xlabel="iteration in cur epoch",
                    ylabel="train_mse_loss")

        model.train()

        # _ is the corresponding label; compression doesn't use it.
        for idx, (data, _) in enumerate(train_dataloader):
            ipt = Variable(data)

            if opt.use_gpu:
                ipt = ipt.cuda()

            optimizer.zero_grad()  # don't forget to clear the gradients!
            reconstructed = model(ipt)

            # print ('reconstructed tensor size :', reconstructed.size())
            loss = mse_loss(reconstructed, ipt)
            caffe_loss = loss / (2 * opt.batch_size)

            if opt.use_imp:
                # print ('use data_parallel?',use_data_parallel)
                # pdb.set_trace()
                rate_loss_display = (model.module if use_data_parallel else
                                     model).imp_mask_sigmoid
                rate_loss_ = rate_loss(rate_loss_display)
                total_loss = caffe_loss + rate_loss_
            else:
                total_loss = caffe_loss

            total_loss.backward()

            optimizer.step()

            mse_loss_meter.add(caffe_loss.data[0])

            if opt.use_imp:
                rate_loss_meter.add(rate_loss_.data[0])
                rate_display_meter.add(rate_loss_display.data.mean())
                total_loss_meter.add(total_loss.data[0])

            if idx % opt.print_freq == opt.print_freq - 1:
                ps.add_point(
                    'train mse loss',
                    mse_loss_meter.value()[0]
                    if opt.print_smooth else caffe_loss.data[0])
                ps.add_point(
                    'cur epoch train mse loss',
                    mse_loss_meter.value()[0]
                    if opt.print_smooth else caffe_loss.data[0])
                if opt.use_imp:
                    ps.add_point(
                        'train rate value',
                        rate_display_meter.value()[0]
                        if opt.print_smooth else rate_loss_display.data.mean())
                    ps.add_point(
                        'train rate loss',
                        rate_loss_meter.value()[0]
                        if opt.print_smooth else rate_loss_.data[0])
                    ps.add_point(
                        'train total loss',
                        total_loss_meter.value()[0]
                        if opt.print_smooth else total_loss.data[0])
                if not opt.use_imp:
                    ps.log('Epoch %d/%d, Iter %d/%d, loss = %.2f, lr = %.8f' %
                           (epoch, opt.max_epoch, idx, len(train_dataloader),
                            total_loss_meter.value()[0], lr))
                else:
                    ps.log(
                        'Epoch %d/%d, Iter %d/%d, loss = %.2f, mse_loss = %.2f, rate_loss = %.2f, rate_display = %.2f, lr = %.8f'
                        %
                        (epoch, opt.max_epoch, idx, len(train_dataloader),
                         total_loss_meter.value()[0],
                         mse_loss_meter.value()[0], rate_loss_meter.value()[0],
                         rate_display_meter.value()[0], lr))
                # enter debug mode
                if os.path.exists(opt.debug_file):
                    pdb.set_trace()

        # data parallel
        # if hasattr(model, 'module'):
        if use_data_parallel:
            model.module.save(optimizer, epoch)
        else:
            model.save(optimizer, epoch)

        # plot before val so progress is visible sooner
        ps.make_plot(
            'train mse loss'
        )  # all epochs share the same image, so use the default "" for epoch
        ps.make_plot('cur epoch train mse loss', epoch)
        if opt.use_imp:
            ps.make_plot("train rate value")
            ps.make_plot("train rate loss")
            ps.make_plot("train total loss")

        # val
        if opt.use_imp:
            mse_val_loss, rate_val_loss, total_val_loss, rate_val_display = val(
                model, val_dataloader, mse_loss, rate_loss, ps)
        else:
            mse_val_loss = val(model, val_dataloader, mse_loss, None, ps)

        ps.add_point('val mse loss', mse_val_loss)
        if opt.use_imp:
            ps.add_point('val rate value', rate_val_display)
            ps.add_point('val rate loss', rate_val_loss)
            ps.add_point('val total loss', total_val_loss)

        ps.make_plot('val mse loss')

        if opt.use_imp:
            ps.make_plot('val rate value')
            ps.make_plot('val rate loss')
            ps.make_plot('val total loss')

        # log sth.
        if opt.use_imp:
            ps.log(
                'Epoch:{epoch}, lr:{lr}, train_mse_loss: {train_mse_loss}, train_rate_loss: {train_rate_loss}, train_total_loss: {train_total_loss}, train_rate_display: {train_rate_display} \n\
val_mse_loss: {val_mse_loss}, val_rate_loss: {val_rate_loss}, val_total_loss: {val_total_loss}, val_rate_display: {val_rate_display} '
                .format(epoch=epoch,
                        lr=lr,
                        train_mse_loss=mse_loss_meter.value()[0],
                        train_rate_loss=rate_loss_meter.value()[0],
                        train_total_loss=total_loss_meter.value()[0],
                        train_rate_display=rate_display_meter.value()[0],
                        val_mse_loss=mse_val_loss,
                        val_rate_loss=rate_val_loss,
                        val_total_loss=total_val_loss,
                        val_rate_display=rate_val_display))
        else:
            ps.log(
                'Epoch:{epoch}, lr:{lr}, train_mse_loss:{train_mse_loss}, val_mse_loss:{val_mse_loss}'
                .format(epoch=epoch,
                        lr=lr,
                        train_mse_loss=mse_loss_meter.value()[0],
                        val_mse_loss=mse_val_loss))

        # Adaptively adjust the lr:
        # for each lr, if the loss is higher than the previous loss
        # opt.tolerant_max times, anneal the learning rate.
        if opt.use_early_adjust:
            if total_loss_meter.value()[0] > previous_loss:
                tolerant_now += 1
                if tolerant_now == opt.tolerant_max:
                    tolerant_now = 0
                    same_lr_epoch = 0
                    lr = lr * opt.lr_decay
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr
                    print('Anneal lr to', lr, 'at epoch', epoch,
                          'due to early stop.')
                    ps.log(
                        'Anneal lr to %.10f at epoch %d due to early stop.' %
                        (lr, epoch))

            else:
                tolerant_now -= 1

        if same_lr_epoch and same_lr_epoch % opt.lr_anneal_epochs == 0:
            same_lr_epoch = 0
            tolerant_now = 0
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            print('Anneal lr to', lr, 'at epoch', epoch, 'due to full epochs.')
            ps.log('Anneal lr to %.10f at epoch %d due to full epochs.' %
                   (lr, epoch))

        previous_loss = total_loss_meter.value()[0]
def train(**kwargs):
    opt.parse(kwargs)
    # log file
    logfile_name = "Cmpr_with_YOLOv2_" + opt.exp_desc + time.strftime(
        "_%m_%d_%H:%M:%S") + ".log.txt"
    ps = PlotSaver(logfile_name)

    # step1: Model
    model = getattr(models, opt.model)(
        use_imp=opt.use_imp,
        n=opt.feat_num,
        input_4_ch=opt.input_4_ch,
        model_name="Cmpr_yolo_imp_" + opt.exp_desc + "_r={r}_gama={w}".format(
            r=opt.rate_loss_threshold, w=opt.rate_loss_weight)
        if opt.use_imp else "Cmpr_yolo_no_imp_" + opt.exp_desc)
    # pdb.set_trace()
    if opt.use_gpu:
        model = multiple_gpu_process(model)

    cudnn.benchmark = True

    # step2: Data
    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    train_data_transforms = transforms.Compose([
        # transforms.RandomHorizontalFlip(),  TODO: reimplement this myself so it operates on the label and the data simultaneously
        transforms.ToTensor(),
        normalize
    ])
    val_data_transforms = transforms.Compose(
        [transforms.ToTensor(), normalize])
    train_data = ImageCropWithBBoxMaskDataset(
        opt.train_data_list,
        train_data_transforms,
        contrastive_degree=opt.contrastive_degree,
        mse_bbox_weight=opt.input_original_bbox_weight)
    val_data = ImageCropWithBBoxMaskDataset(
        opt.val_data_list,
        val_data_transforms,
        contrastive_degree=opt.contrastive_degree,
        mse_bbox_weight=opt.input_original_bbox_weight)
    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers,
                                  pin_memory=True)
    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers,
                                pin_memory=True)
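    # Note (inferred from the training loop below): each batch from
    # ImageCropWithBBoxMaskDataset yields (image, detection bbox mask, original
    # bbox weight mask), i.e. the (data, mask, o_mask) triple unpacked later.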

    # step3: criterion and optimizer

    mse_loss = t.nn.MSELoss(size_average=False)

    if opt.use_imp:
        # TODO: new rate loss
        rate_loss = RateLoss(opt.rate_loss_threshold, opt.rate_loss_weight)
        # rate_loss = LimuRateLoss(opt.rate_loss_threshold, opt.rate_loss_weight)

    def weighted_mse_loss(input, target, weight):
        # weight[weight!=opt.mse_bbox_weight] = 1
        # weight[weight==opt.mse_bbox_weight] = opt.mse_bbox_weight
        # print('max val', weight.max())
        # return mse_loss(input, target)
        # weight_clone = weight.clone()
        # weight_clone[weight_clone == opt.input_original_bbox_weight] = 0
        # return t.sum(weight_clone * (input - target) ** 2)
        weight_clone = t.ones_like(weight)
        weight_clone[weight ==
                     opt.input_original_bbox_inner] = opt.mse_bbox_weight
        return t.sum(weight_clone * (input - target)**2)
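    # Rough illustration (option semantics assumed from the names): if
    # opt.input_original_bbox_inner marks pixels inside a detected box and
    # opt.mse_bbox_weight is, say, 10, then weight_clone is 1 outside boxes and
    # 10 inside, so bbox pixels contribute 10x to the summed squared error.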

    def yolo_rate_loss(imp_map, mask_r):
        return rate_loss(imp_map)
        # V2 contrastive_degree must be 0!
        # return YoloRateLossV2(mask_r, opt.rate_loss_threshold, opt.rate_loss_weight)(imp_map)

    lr = opt.lr
    optimizer = t.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999))

    start_epoch = 0
    decay_file_create_time = -1  # check the decay file's modification time so the same file doesn't trigger repeated lr decays

    if opt.resume:
        if use_data_parallel:
            start_epoch = model.module.load(
                None if opt.finetune else optimizer, opt.resume, opt.finetune)
        else:
            start_epoch = model.load(None if opt.finetune else optimizer,
                                     opt.resume, opt.finetune)

        if opt.finetune:
            print('Finetune from model checkpoint file', opt.resume)
        else:
            print('Resume training from checkpoint file', opt.resume)
            print('Continue training at epoch %d.' % start_epoch)

    # step4: meters
    mse_loss_meter = AverageValueMeter()
    if opt.use_imp:
        rate_loss_meter = AverageValueMeter()
        rate_display_meter = AverageValueMeter()
        total_loss_meter = AverageValueMeter()

    previous_loss = 1e100
    tolerant_now = 0
    same_lr_epoch = 0

    # ps init

    ps.new_plot('train mse loss',
                opt.print_freq,
                xlabel="iteration",
                ylabel="train_mse_loss")
    ps.new_plot('val mse loss', 1, xlabel="epoch", ylabel="val_mse_loss")
    if opt.use_imp:
        ps.new_plot('train rate value',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_rate_value")
        ps.new_plot('train rate loss',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_rate_loss")
        ps.new_plot('train total loss',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_total_loss")
        ps.new_plot('val rate value',
                    1,
                    xlabel="epoch",
                    ylabel="val_rate_value")
        ps.new_plot('val rate loss',
                    1,
                    xlabel="epoch",
                    ylabel="val_rate_loss")
        ps.new_plot('val total loss',
                    1,
                    xlabel="epoch",
                    ylabel="val_total_loss")

    for epoch in range(start_epoch + 1, opt.max_epoch + 1):

        same_lr_epoch += 1
        # per epoch avg loss meter
        mse_loss_meter.reset()
        if opt.use_imp:
            rate_display_meter.reset()
            rate_loss_meter.reset()
            total_loss_meter.reset()
        else:
            total_loss_meter = mse_loss_meter
        # cur_epoch_loss refresh every epoch
        # vis.refresh_plot('cur epoch train mse loss')
        ps.new_plot("cur epoch train mse loss",
                    opt.print_freq,
                    xlabel="iteration in cur epoch",
                    ylabel="train_mse_loss")
        # progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader), ascii=True)
        # progress_bar.set_description('epoch %d/%d, loss = 0.00' % (epoch, opt.max_epoch))

        # Init val
        if (epoch == start_epoch + 1) and opt.init_val:
            print('Init validation ... ')
            if opt.use_imp:
                mse_val_loss, rate_val_loss, total_val_loss, rate_val_display = val(
                    model, val_dataloader, weighted_mse_loss, yolo_rate_loss,
                    ps)
            else:
                mse_val_loss = val(model, val_dataloader, weighted_mse_loss,
                                   None, ps)

            ps.add_point('val mse loss', mse_val_loss)
            if opt.use_imp:
                ps.add_point('val rate value', rate_val_display)
                ps.add_point('val rate loss', rate_val_loss)
                ps.add_point('val total loss', total_val_loss)

            ps.make_plot('val mse loss')

            if opt.use_imp:
                ps.make_plot('val rate value')
                ps.make_plot('val rate loss')
                ps.make_plot('val total loss')

            # log sth.
            if opt.use_imp:
                ps.log(
                    'Init Val @ Epoch:{epoch}, lr:{lr}, val_mse_loss: {val_mse_loss}, val_rate_loss: {val_rate_loss}, val_total_loss: {val_total_loss}, val_rate_display: {val_rate_display} '
                    .format(epoch=epoch,
                            lr=lr,
                            val_mse_loss=mse_val_loss,
                            val_rate_loss=rate_val_loss,
                            val_total_loss=total_val_loss,
                            val_rate_display=rate_val_display))
            else:
                ps.log(
                    'Init Val @ Epoch:{epoch}, lr:{lr}, val_mse_loss:{val_mse_loss}'
                    .format(epoch=epoch, lr=lr, val_mse_loss=mse_val_loss))

        if opt.only_init_val:
            print('Only Init Val Over!')
            return

        model.train()

        if epoch == start_epoch + 1:
            print('Start training, please inspect log file %s!' % logfile_name)
        # mask is the detection bounding box mask
        for idx, (data, mask, o_mask) in enumerate(train_dataloader):

            # pdb.set_trace()

            data = Variable(data)
            mask = Variable(mask)
            o_mask = Variable(o_mask, requires_grad=False)

            if opt.use_gpu:
                data = data.cuda(non_blocking=True)  # `async` was renamed to `non_blocking` in newer PyTorch
                mask = mask.cuda(non_blocking=True)
                o_mask = o_mask.cuda(non_blocking=True)

            # pdb.set_trace()

            optimizer.zero_grad()
            reconstructed, imp_mask_sigmoid = model(data, mask, o_mask)

            # print ('imp_mask_height', model.imp_mask_height)
            # pdb.set_trace()

            # print ('type recons', type(reconstructed.data))

            loss = weighted_mse_loss(reconstructed, data, o_mask)
            # loss = mse_loss(reconstructed, data)
            caffe_loss = loss / (2 * opt.batch_size)

            if opt.use_imp:
                rate_loss_display = imp_mask_sigmoid
                # rate_loss_ =  rate_loss(rate_loss_display)
                rate_loss_ = yolo_rate_loss(rate_loss_display, mask)
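                # note: with the V1 rate loss selected inside yolo_rate_loss, the mask
                # argument is accepted but not actually used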
                total_loss = caffe_loss + rate_loss_
            else:
                total_loss = caffe_loss

            total_loss.backward()
            optimizer.step()

            mse_loss_meter.add(caffe_loss.data[0])

            if opt.use_imp:
                rate_loss_meter.add(rate_loss_.data[0])
                rate_display_meter.add(rate_loss_display.data.mean())
                total_loss_meter.add(total_loss.data[0])

            if idx % opt.print_freq == opt.print_freq - 1:
                ps.add_point(
                    'train mse loss',
                    mse_loss_meter.value()[0]
                    if opt.print_smooth else caffe_loss.data[0])
                ps.add_point(
                    'cur epoch train mse loss',
                    mse_loss_meter.value()[0]
                    if opt.print_smooth else caffe_loss.data[0])
                if opt.use_imp:
                    ps.add_point(
                        'train rate value',
                        rate_display_meter.value()[0]
                        if opt.print_smooth else rate_loss_display.data.mean())
                    ps.add_point(
                        'train rate loss',
                        rate_loss_meter.value()[0]
                        if opt.print_smooth else rate_loss_.data[0])
                    ps.add_point(
                        'train total loss',
                        total_loss_meter.value()[0]
                        if opt.print_smooth else total_loss.data[0])

                if not opt.use_imp:
                    ps.log('Epoch %d/%d, Iter %d/%d, loss = %.2f, lr = %.8f' %
                           (epoch, opt.max_epoch, idx, len(train_dataloader),
                            total_loss_meter.value()[0], lr))
                else:
                    ps.log(
                        'Epoch %d/%d, Iter %d/%d, loss = %.2f, mse_loss = %.2f, rate_loss = %.2f, rate_display = %.2f, lr = %.8f'
                        %
                        (epoch, opt.max_epoch, idx, len(train_dataloader),
                         total_loss_meter.value()[0],
                         mse_loss_meter.value()[0], rate_loss_meter.value()[0],
                         rate_display_meter.value()[0], lr))

                # enter debug mode
                if os.path.exists(opt.debug_file):
                    pdb.set_trace()

        if epoch % opt.save_interval == 0:
            print('save checkpoint file of epoch %d.' % epoch)
            if use_data_parallel:
                model.module.save(optimizer, epoch)
            else:
                model.save(optimizer, epoch)

        ps.make_plot('train mse loss')
        ps.make_plot('cur epoch train mse loss', epoch)
        if opt.use_imp:
            ps.make_plot("train rate value")
            ps.make_plot("train rate loss")
            ps.make_plot("train total loss")

        if epoch % opt.eval_interval == 0:

            print('Validating ...')
            # val
            if opt.use_imp:
                mse_val_loss, rate_val_loss, total_val_loss, rate_val_display = val(
                    model, val_dataloader, weighted_mse_loss, yolo_rate_loss,
                    ps)
            else:
                mse_val_loss = val(model, val_dataloader, weighted_mse_loss,
                                   None, ps)

            ps.add_point('val mse loss', mse_val_loss)
            if opt.use_imp:
                ps.add_point('val rate value', rate_val_display)
                ps.add_point('val rate loss', rate_val_loss)
                ps.add_point('val total loss', total_val_loss)

            ps.make_plot('val mse loss')

            if opt.use_imp:
                ps.make_plot('val rate value')
                ps.make_plot('val rate loss')
                ps.make_plot('val total loss')

            # log sth.
            if opt.use_imp:
                ps.log(
                    'Epoch:{epoch}, lr:{lr}, train_mse_loss: {train_mse_loss}, train_rate_loss: {train_rate_loss}, train_total_loss: {train_total_loss}, train_rate_display: {train_rate_display} \n\
    val_mse_loss: {val_mse_loss}, val_rate_loss: {val_rate_loss}, val_total_loss: {val_total_loss}, val_rate_display: {val_rate_display} '
                    .format(epoch=epoch,
                            lr=lr,
                            train_mse_loss=mse_loss_meter.value()[0],
                            train_rate_loss=rate_loss_meter.value()[0],
                            train_total_loss=total_loss_meter.value()[0],
                            train_rate_display=rate_display_meter.value()[0],
                            val_mse_loss=mse_val_loss,
                            val_rate_loss=rate_val_loss,
                            val_total_loss=total_val_loss,
                            val_rate_display=rate_val_display))
            else:
                ps.log(
                    'Epoch:{epoch}, lr:{lr}, train_mse_loss:{train_mse_loss}, val_mse_loss:{val_mse_loss}'
                    .format(epoch=epoch,
                            lr=lr,
                            train_mse_loss=mse_loss_meter.value()[0],
                            val_mse_loss=mse_val_loss))

        # Adaptively adjust lr:
        # for each lr, if the loss comes out higher than the previous val_loss opt.tolerant_max times,
        # update the learning rate
        # if loss_meter.value()[0] > previous_loss:
        if opt.use_early_adjust:
            if total_loss_meter.value()[0] > previous_loss:
                tolerant_now += 1
                if tolerant_now == opt.tolerant_max:
                    tolerant_now = 0
                    same_lr_epoch = 0
                    lr = lr * opt.lr_decay
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr
                    print('Due to early stop anneal lr to %.10f at epoch %d' %
                          (lr, epoch))
                    ps.log('Due to early stop anneal lr to %.10f at epoch %d' %
                           (lr, epoch))

            else:
                tolerant_now -= 1

        if epoch % opt.lr_anneal_epochs == 0:
            # if same_lr_epoch and same_lr_epoch % opt.lr_anneal_epochs == 0:
            same_lr_epoch = 0
            tolerant_now = 0
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            print('Anneal lr to %.10f at epoch %d due to full epochs.' %
                  (lr, epoch))
            ps.log('Anneal lr to %.10f at epoch %d due to full epochs.' %
                   (lr, epoch))

        if opt.use_file_decay_lr and os.path.exists(opt.lr_decay_file):
            cur_mtime = os.path.getmtime(opt.lr_decay_file)
            if cur_mtime > decay_file_create_time:
                decay_file_create_time = cur_mtime
                lr = lr * opt.lr_decay
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
                print(
                    'Anneal lr to %.10f at epoch %d due to decay-file indicator.'
                    % (lr, epoch))
                ps.log(
                    'Anneal lr to %.10f at epoch %d due to decay-file indicator.'
                    % (lr, epoch))

        previous_loss = total_loss_meter.value()[0]
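
# Below is a minimal, self-contained sketch (not part of the project) of the per-iteration
# loss bookkeeping used in the training loop above. The bbox-weighted MSE mirrors
# weighted_mse_loss; the rate penalty is only a stand-in hinge on the mean importance value
# with the same (threshold, weight) interface, so the project's RateLoss / LimuRateLoss may
# differ. All numeric values here are hypothetical.
import torch as t

def bbox_weighted_mse(recon, target, weight, bbox_inner=1.0, bbox_weight=10.0):
    # up-weight the squared error inside the bounding box (hypothetical weights)
    w = t.ones_like(weight)
    w[weight == bbox_inner] = bbox_weight
    return t.sum(w * (recon - target) ** 2)

def hinge_rate_penalty(imp_map, threshold=0.5, weight=1.0):
    # stand-in rate penalty: only active when the mean importance exceeds the threshold
    return weight * t.clamp(imp_map.mean() - threshold, min=0.0)

batch_size = 4
target = t.rand(batch_size, 3, 32, 32)
recon = target + 0.05 * t.randn_like(target)
o_mask = (t.rand(batch_size, 3, 32, 32) > 0.8).float()  # fake bbox mask
imp_map = t.rand(batch_size, 1, 8, 8)                    # fake importance map

mse = bbox_weighted_mse(recon, target, o_mask)
caffe_loss = mse / (2 * batch_size)                      # same normalization as in the loop above
total_loss = caffe_loss + hinge_rate_penalty(imp_map)
print(float(caffe_loss), float(total_loss))
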
def train(**kwargs):
    opt.parse(kwargs)
    # vis = Visualizer(opt.env)
    # log file
    ps = PlotSaver("CLIC_finetune_With_TestSet_0.13_" +
                   time.strftime("%m_%d_%H:%M:%S") + ".log.txt")

    # step1: Model

    # CWCNN_limu_ImageNet_imp_
    model = getattr(models, opt.model)(
        use_imp=opt.use_imp,
        model_name="CLIC_imp_ft_testSet_2_r={r}_gama={w}".format(
            r=opt.rate_loss_threshold, w=opt.rate_loss_weight)
        if opt.use_imp else None)
    # if opt.use_imp else "test_pytorch")
    if opt.use_gpu:
        model = multiple_gpu_process(model)
        # model.cuda()  # because I want model.imp_mask_sigmoid

    # freeze the decoder network and fine-tune the encoder + importance map with the train + val set
    if opt.freeze_decoder:
        for param in model.module.decoder.parameters():
            # print (param.requires_grad)
            param.requires_grad = False

    cudnn.benchmark = True

    # step2: Data
    # normalize = transforms.Normalize(
    #                 mean=[0.485, 0.456, 0.406],
    #                 std=[0.229, 0.224, 0.225]
    #                 )

    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    # train_data_transforms = transforms.Compose(
    #     [
    #         transforms.Resize(256),
    #         # transforms.RandomCrop(128),
    #         # transforms.Resize((128,128)),
    #         # transforms.RandomHorizontalFlip(),
    #         transforms.ToTensor(),
    #         # transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))
    #         normalize
    #     ]
    # )
    # val_data_transforms = transforms.Compose(
    #     [
    #         # transforms.Resize(256),
    #         # transforms.CenterCrop(128),
    #         # transforms.Resize((128,128)),
    #         # transforms.TenCrop(224),
    #         # transforms.Lambda(lambda crops: t.stack(([normalize(transforms.ToTensor()(crop)) for crop in crops]))),
    #         transforms.ToTensor(),
    #         normalize
    #     ]
    # )

    train_data_transforms = transforms.Compose([
        transforms.Resize(256),
        #transforms.Scale(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize
    ])
    val_data_transforms = transforms.Compose([
        transforms.Resize(256),
        #transforms.Scale(256),
        transforms.CenterCrop(224),
        # transforms.TenCrop(224),
        # transforms.Lambda(lambda crops: t.stack(([normalize(transforms.ToTensor()(crop)) for crop in crops]))),
        transforms.ToTensor(),
        normalize
    ])
    # train_data = ImageNet_200k(opt.train_data_root, train=True, transforms=data_transforms)
    # val_data = ImageNet_200k(opt.val_data_root, train = False, transforms=data_transforms)
    train_data = datasets.ImageFolder(opt.train_data_root,
                                      train_data_transforms)
    val_data = datasets.ImageFolder(opt.val_data_root, val_data_transforms)
    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers,
                                  pin_memory=True)
    # train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=True, num_workers=opt.num_workers)

    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers,
                                pin_memory=True)
    # val_dataloader = DataLoader(val_data, opt.batch_size, shuffle=False, num_workers=opt.num_workers)

    # step3: criterion and optimizer

    mse_loss = t.nn.MSELoss(size_average=False)

    if opt.use_imp:
        # rate_loss = RateLoss(opt.rate_loss_threshold, opt.rate_loss_weight)
        rate_loss = LimuRateLoss(opt.rate_loss_threshold, opt.rate_loss_weight)

    lr = opt.lr

    # print ('model.parameters():')
    # print (model.parameters())

    # optimizer = t.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999))
    optimizer = t.optim.Adam(filter(lambda p: p.requires_grad,
                                    model.parameters()),
                             lr=lr,
                             betas=(0.9, 0.999))

    start_epoch = 0

    if opt.resume:
        if use_data_parallel:
            start_epoch = model.module.load(
                None if opt.finetune else optimizer, opt.resume, opt.finetune)
        else:
            start_epoch = model.load(None if opt.finetune else optimizer,
                                     opt.resume, opt.finetune)

        if opt.finetune:
            print('Finetune from model checkpoint file', opt.resume)
        else:
            print('Resume training from checkpoint file', opt.resume)
            print('Continue training at epoch %d.' % start_epoch)

    # step4: meters
    mse_loss_meter = AverageValueMeter()
    if opt.use_imp:
        rate_loss_meter = AverageValueMeter()
        rate_display_meter = AverageValueMeter()
        total_loss_meter = AverageValueMeter()

    previous_loss = 1e100
    tolerant_now = 0
    same_lr_epoch = 0

    # ps init

    ps.new_plot('train mse loss',
                opt.print_freq,
                xlabel="iteration",
                ylabel="train_mse_loss")
    ps.new_plot('val mse loss', 1, xlabel="epoch", ylabel="val_mse_loss")
    if opt.use_imp:
        ps.new_plot('train rate value',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_rate_value")
        ps.new_plot('train rate loss',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_rate_loss")
        ps.new_plot('train total loss',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_total_loss")
        ps.new_plot('val rate value',
                    1,
                    xlabel="iteration",
                    ylabel="val_rate_value")
        ps.new_plot('val rate loss',
                    1,
                    xlabel="iteration",
                    ylabel="val_rate_loss")
        ps.new_plot('val total loss',
                    1,
                    xlabel="iteration",
                    ylabel="val_total_loss")

    for epoch in range(start_epoch + 1, opt.max_epoch + 1):

        same_lr_epoch += 1
        # per epoch avg loss meter
        mse_loss_meter.reset()
        if opt.use_imp:
            rate_display_meter.reset()
            rate_loss_meter.reset()
            total_loss_meter.reset()
        else:
            total_loss_meter = mse_loss_meter
        # cur_epoch_loss refresh every epoch
        # vis.refresh_plot('cur epoch train mse loss')
        ps.new_plot("cur epoch train mse loss",
                    opt.print_freq,
                    xlabel="iteration in cur epoch",
                    ylabel="train_mse_loss")
        # progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader), ascii=True)
        # progress_bar.set_description('epoch %d/%d, loss = 0.00' % (epoch, opt.max_epoch))

        # Init val
        if (epoch == start_epoch + 1) and opt.init_val:
            print('Init validation ... ')
            if opt.use_imp:
                mse_val_loss, rate_val_loss, total_val_loss, rate_val_display = val(
                    model, val_dataloader, mse_loss, rate_loss, ps)
            else:
                mse_val_loss = val(model, val_dataloader, mse_loss, None, ps)

            ps.add_point('val mse loss', mse_val_loss)
            if opt.use_imp:
                ps.add_point('val rate value', rate_val_display)
                ps.add_point('val rate loss', rate_val_loss)
                ps.add_point('val total loss', total_val_loss)

            # make plot
            # ps.make_plot('train mse loss', "")   # all epoch share a same img, so give "" to epoch
            # ps.make_plot('cur epoch train mse loss',epoch)
            ps.make_plot('val mse loss')

            if opt.use_imp:
                # ps.make_plot("train rate value","")
                # ps.make_plot("train rate loss","")
                # ps.make_plot("train total loss","")
                ps.make_plot('val rate value')
                ps.make_plot('val rate loss')
                ps.make_plot('val total loss')

            # log sth.
            if opt.use_imp:
                ps.log(
                    'Init Val @ Epoch:{epoch}, lr:{lr}, val_mse_loss: {val_mse_loss}, val_rate_loss: {val_rate_loss}, val_total_loss: {val_total_loss}, val_rate_display: {val_rate_display} '
                    .format(epoch=epoch,
                            lr=lr,
                            val_mse_loss=mse_val_loss,
                            val_rate_loss=rate_val_loss,
                            val_total_loss=total_val_loss,
                            val_rate_display=rate_val_display))
            else:
                ps.log(
                    'Init Val @ Epoch:{epoch}, lr:{lr}, val_mse_loss:{val_mse_loss}'
                    .format(
                        epoch=epoch,
                        lr=lr,
                        # train_mse_loss = mse_loss_meter.value()[0],
                        val_mse_loss=mse_val_loss))

        model.train()

        # mse_val_loss = val(model, val_dataloader, mse_loss, None, ps)

        # _ is the corresponding label; compression does not use it.
        for idx, (data, _) in enumerate(train_dataloader):
            ipt = Variable(data)

            if opt.use_gpu:
                # if not use_data_parallel:  # ipt is also the reconstruction target, so it still needs to be moved to CUDA
                ipt = ipt.cuda(non_blocking=True)  # `async` was renamed to `non_blocking` in newer PyTorch

            optimizer.zero_grad()
            reconstructed, imp_mask_sigmoid = model(ipt)

            # print ('imp_mask_height', model.imp_mask_height)
            # pdb.set_trace()

            # print ('type recons', type(reconstructed.data))
            loss = mse_loss(reconstructed, ipt)
            caffe_loss = loss / (2 * opt.batch_size)

            if opt.use_imp:
                # rate_loss_display = model.imp_mask_sigmoid
                # rate_loss_display = (model.module if use_data_parallel else model).imp_mask_sigmoid
                rate_loss_display = imp_mask_sigmoid
                # rate_loss_display = model.imp_mask
                # print ('type of display', type(rate_loss_display.data))
                rate_loss_ = rate_loss(rate_loss_display)
                # print (
                #     'type of rate_loss_value',
                #     type(rate_loss_value.data)
                # )
                total_loss = caffe_loss + rate_loss_
            else:
                total_loss = caffe_loss

            # 1.
            total_loss.backward()
            # caffe_loss.backward()
            optimizer.step()

            mse_loss_meter.add(caffe_loss.data[0])

            if opt.use_imp:
                rate_loss_meter.add(rate_loss_.data[0])
                rate_display_meter.add(rate_loss_display.data.mean())
                total_loss_meter.add(total_loss.data[0])

            if idx % opt.print_freq == opt.print_freq - 1:
                ps.add_point(
                    'train mse loss',
                    mse_loss_meter.value()[0]
                    if opt.print_smooth else caffe_loss.data[0])
                ps.add_point(
                    'cur epoch train mse loss',
                    mse_loss_meter.value()[0]
                    if opt.print_smooth else caffe_loss.data[0])
                # print (rate_loss_display.data.mean())
                if opt.use_imp:
                    ps.add_point(
                        'train rate value',
                        rate_display_meter.value()[0]
                        if opt.print_smooth else rate_loss_display.data.mean())
                    ps.add_point(
                        'train rate loss',
                        rate_loss_meter.value()[0]
                        if opt.print_smooth else rate_loss_.data[0])
                    ps.add_point(
                        'train total loss',
                        total_loss_meter.value()[0]
                        if opt.print_smooth else total_loss.data[0])
                # pdb.set_trace()
                # progress_bar.set_description('epoch %d/%d, loss = %.2f' % (epoch, opt.max_epoch, total_loss.data[0]))

                #  2.
                # ps.log('Epoch %d/%d, Iter %d/%d, loss = %.2f, lr = %.8f' % (epoch, opt.max_epoch, idx, len(train_dataloader), total_loss_meter.value()[0], lr))
                # ps.log('Epoch %d/%d, Iter %d/%d, loss = %.2f, lr = %.8f' % (epoch, opt.max_epoch, idx, len(train_dataloader), mse_loss_meter.value()[0], lr))

                # ps.log('loss = %f' % caffe_loss.data[0])
                # print(total_loss.data[0])
                # input('waiting......')

                if not opt.use_imp:
                    ps.log('Epoch %d/%d, Iter %d/%d, loss = %.2f, lr = %.8f' %
                           (epoch, opt.max_epoch, idx, len(train_dataloader),
                            total_loss_meter.value()[0], lr))
                else:
                    ps.log(
                        'Epoch %d/%d, Iter %d/%d, loss = %.2f, mse_loss = %.2f, rate_loss = %.2f, rate_display = %.2f, lr = %.8f'
                        %
                        (epoch, opt.max_epoch, idx, len(train_dataloader),
                         total_loss_meter.value()[0],
                         mse_loss_meter.value()[0], rate_loss_meter.value()[0],
                         rate_display_meter.value()[0], lr))
                # enter debug mode
                if os.path.exists(opt.debug_file):
                    # import pdb
                    pdb.set_trace()

        if use_data_parallel:
            # print (type(model.module))
            # print (model)
            # print (type(model))
            model.module.save(optimizer, epoch)
        else:
            model.save(optimizer, epoch)

        # print ('case error', total_loss.data[0])
        # print ('smoothed error', total_loss_meter.value()[0])

        # plot before validation for easier monitoring
        ps.make_plot('train mse loss')  # all epochs share the same image, so no per-epoch suffix is passed
        ps.make_plot('cur epoch train mse loss', epoch)
        if opt.use_imp:
            ps.make_plot("train rate value")
            ps.make_plot("train rate loss")
            ps.make_plot("train total loss")

        # val
        if opt.use_imp:
            mse_val_loss, rate_val_loss, total_val_loss, rate_val_display = val(
                model, val_dataloader, mse_loss, rate_loss, ps)
        else:
            mse_val_loss = val(model, val_dataloader, mse_loss, None, ps)

        ps.add_point('val mse loss', mse_val_loss)
        if opt.use_imp:
            ps.add_point('val rate value', rate_val_display)
            ps.add_point('val rate loss', rate_val_loss)
            ps.add_point('val total loss', total_val_loss)

        # make plot
        # ps.make_plot('train mse loss', "")   # all epoch share a same img, so give "" to epoch
        # ps.make_plot('cur epoch train mse loss',epoch)
        ps.make_plot('val mse loss')

        if opt.use_imp:
            # ps.make_plot("train rate value","")
            # ps.make_plot("train rate loss","")
            # ps.make_plot("train total loss","")
            ps.make_plot('val rate value')
            ps.make_plot('val rate loss')
            ps.make_plot('val total loss')

        # log sth.
        if opt.use_imp:
            ps.log(
                'Epoch:{epoch}, lr:{lr}, train_mse_loss: {train_mse_loss}, train_rate_loss: {train_rate_loss}, train_total_loss: {train_total_loss}, train_rate_display: {train_rate_display} \n\
val_mse_loss: {val_mse_loss}, val_rate_loss: {val_rate_loss}, val_total_loss: {val_total_loss}, val_rate_display: {val_rate_display} '
                .format(epoch=epoch,
                        lr=lr,
                        train_mse_loss=mse_loss_meter.value()[0],
                        train_rate_loss=rate_loss_meter.value()[0],
                        train_total_loss=total_loss_meter.value()[0],
                        train_rate_display=rate_display_meter.value()[0],
                        val_mse_loss=mse_val_loss,
                        val_rate_loss=rate_val_loss,
                        val_total_loss=total_val_loss,
                        val_rate_display=rate_val_display))
        else:
            ps.log(
                'Epoch:{epoch}, lr:{lr}, train_mse_loss:{train_mse_loss}, val_mse_loss:{val_mse_loss}'
                .format(epoch=epoch,
                        lr=lr,
                        train_mse_loss=mse_loss_meter.value()[0],
                        val_mse_loss=mse_val_loss))

        # Adaptively adjust lr:
        # for each lr, if the loss comes out higher than the previous val_loss opt.tolerant_max times,
        # update the learning rate
        # if loss_meter.value()[0] > previous_loss:
        if opt.use_early_adjust:
            if total_loss_meter.value()[0] > previous_loss:
                tolerant_now += 1
                if tolerant_now == opt.tolerant_max:
                    tolerant_now = 0
                    same_lr_epoch = 0
                    lr = lr * opt.lr_decay
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr
                    print('Due to early stop anneal lr to', lr, 'at epoch',
                          epoch)
                    ps.log('Due to early stop anneal lr to %.10f at epoch %d' %
                           (lr, epoch))

            else:
                tolerant_now -= 1

        # if same_lr_epoch and same_lr_epoch % opt.lr_anneal_epochs == 0:
        #     same_lr_epoch = 0
        #     tolerant_now = 0
        #     lr = lr * opt.lr_decay
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr
        #     print ('Due to full epochs anneal lr to',lr,'at epoch',epoch)
        #     ps.log ('Due to full epochs anneal lr to %.10f at epoch %d' % (lr, epoch))

        if opt.use_file_decay_lr and os.path.exists(opt.lr_decay_file):
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        # previous_loss = total_loss_meter.value()[0] if opt.use_imp else mse_loss_meter.value()[0]
        previous_loss = total_loss_meter.value()[0]
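
# A minimal standalone sketch of the freeze-and-finetune pattern used in the function above:
# decoder parameters are frozen and only the remaining parameters are handed to Adam via
# filter(lambda p: p.requires_grad, ...). The Encoder/Decoder modules here are toy stand-ins,
# not the project's models.
import torch as t
import torch.nn as nn

model = nn.Sequential()
model.add_module('encoder', nn.Conv2d(3, 8, 3, padding=1))
model.add_module('decoder', nn.Conv2d(8, 3, 3, padding=1))

# freeze the decoder, exactly as in the finetuning code above
for param in model.decoder.parameters():
    param.requires_grad = False

# only parameters that still require grad are optimized
optimizer = t.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=1e-4, betas=(0.9, 0.999))

x = t.rand(2, 3, 16, 16)
loss = ((model(x) - x) ** 2).sum() / (2 * x.size(0))
loss.backward()
optimizer.step()
# prints only the encoder parameters, since the decoder is frozen
print([name for name, p in model.named_parameters() if p.requires_grad])
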
def train(**kwargs):
    global batch_model_id
    opt.parse(kwargs)
    if opt.use_batch_process:
        max_bp_times = len(opt.exp_ids)
        if batch_model_id >= max_bp_times:
            print('Batch Processing Done!')
            return
        else:
            print('Cur Batch Processing ID is %d/%d.' %
                  (batch_model_id + 1, max_bp_times))
            opt.r = opt.r_s[batch_model_id]
            opt.exp_id = opt.exp_ids[batch_model_id]
            opt.exp_desc = opt.exp_desc_LUT[opt.exp_id]
            opt.plot_path = "plot/plot_%d" % opt.exp_id
            print('Cur Model(exp_%d) r = %f, desc = %s. ' %
                  (opt.exp_id, opt.r, opt.exp_desc))
    opt.make_new_dirs()
    # log file
    EvalVal = opt.only_init_val and opt.init_val and not opt.test_test
    EvalTest = opt.only_init_val and opt.init_val and opt.test_test
    EvalSuffix = ""
    if EvalVal:
        EvalSuffix = "_val"
    if EvalTest:
        EvalSuffix = "_test"
    logfile_name = opt.exp_desc + time.strftime(
        "_%m_%d_%H:%M:%S") + EvalSuffix + ".log.txt"

    ps = PlotSaver(logfile_name, log_to_stdout=opt.log_to_stdout)

    # step1: Model
    # model = getattr(models, opt.model)(use_imp = opt.use_imp, n = opt.feat_num, model_name=opt.exp_desc + ("_r={r}_gm={w}".format(
    #                                                             r=opt.rate_loss_threshold,
    #                                                             w=opt.rate_loss_weight)
    #                                                           if opt.use_imp else "_no_imp"))

    model = getattr(models, opt.model)(use_imp=opt.use_imp,
                                       n=opt.feat_num,
                                       model_name=opt.exp_desc)
    # print (model)
    # pdb.set_trace()
    global use_data_parallel
    if opt.use_gpu:
        model, use_data_parallel = multiple_gpu_process(model)

    # decide whether the GPU is actually usable (fall back to CPU otherwise)
    opt.use_gpu = opt.use_gpu and use_data_parallel >= 0
    cudnn.benchmark = True

    # step2: Data
    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    train_data_transforms = transforms.Compose([
        # transforms.RandomHorizontalFlip(),  TODO: reimplement this so it operates on label and data simultaneously
        transforms.RandomCrop(128),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize
    ])
    val_data_transforms = transforms.Compose(
        [transforms.CenterCrop(128),
         transforms.ToTensor(), normalize])

    caffe_data_transforms = transforms.Compose([transforms.CenterCrop(128)])
    # transforms ||  data
    train_data = ImageFilelist(
        flist=opt.train_data_list,
        transform=train_data_transforms,
    )

    val_data = ImageFilelist(
        flist=opt.val_data_list,
        prefix=opt.val_data_prefix,
        transform=val_data_transforms,
    )

    val_data_caffe = ImageFilelist(flist=opt.val_data_list,
                                   transform=caffe_data_transforms)

    test_data_caffe = ImageFilelist(flist=opt.test_data_list,
                                    transform=caffe_data_transforms)

    if opt.make_caffe_data:
        save_caffe_data(test_data_caffe)
        print('Make caffe dataset over!')
        return

    # train_data = ImageCropWithBBoxMaskDataset(
    #     opt.train_data_list,
    #     train_data_transforms,
    #     contrastive_degree = opt.contrastive_degree,
    #     mse_bbox_weight = opt.input_original_bbox_weight
    # )
    # val_data = ImageCropWithBBoxMaskDataset(
    #     opt.val_data_list,
    #     val_data_transforms,
    #     contrastive_degree = opt.contrastive_degree,
    #     mse_bbox_weight = opt.input_original_bbox_weight
    # )
    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers,
                                  pin_memory=True)
    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers,
                                pin_memory=True)

    # step3: criterion and optimizer

    mse_loss = t.nn.MSELoss(size_average=False)

    if opt.use_imp:
        # TODO: new rate loss
        rate_loss = RateLoss(opt.rate_loss_threshold, opt.rate_loss_weight)
        # rate_loss = LimuRateLoss(opt.rate_loss_threshold, opt.rate_loss_weight)

    def weighted_mse_loss(input, target, weight):
        # weight[weight!=opt.mse_bbox_weight] = 1
        # weight[weight==opt.mse_bbox_weight] = opt.mse_bbox_weight
        # print('max val', weight.max())
        # return mse_loss(input, target)
        # weight_clone = weight.clone()
        # weight_clone[weight_clone == opt.input_original_bbox_weight] = 0
        # return t.sum(weight_clone * (input - target) ** 2)
        weight_clone = t.ones_like(weight)
        weight_clone[weight ==
                     opt.input_original_bbox_inner] = opt.mse_bbox_weight
        return t.sum(weight_clone * (input - target)**2)

    def yolo_rate_loss(imp_map, mask_r):
        return rate_loss(imp_map)
        # V2 contrastive_degree must be 0!
        # return YoloRateLossV2(mask_r, opt.rate_loss_threshold, opt.rate_loss_weight)(imp_map)

    start_epoch = 0
    decay_file_create_time = -1  # track the decay file's modification time so the same file does not decay the learning rate repeatedly

    previous_loss = 1e100
    tolerant_now = 0
    same_lr_epoch = 0
    lr = opt.lr
    optimizer = t.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999))

    if opt.resume:
        start_epoch = (model.module if use_data_parallel == 1 else model).load(
            None if opt.finetune else optimizer, opt.resume, opt.finetune)

        if opt.finetune:
            print('Finetune from model checkpoint file', opt.resume)
        else:
            print('Resume training from checkpoint file', opt.resume)
            print('Continue training from epoch %d.' % start_epoch)
            same_lr_epoch = start_epoch % opt.lr_anneal_epochs
            decay_times = start_epoch // opt.lr_anneal_epochs
            lr = opt.lr * (opt.lr_decay**decay_times)
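            # e.g. (hypothetical values): start_epoch=250, lr_anneal_epochs=100,
            # lr_decay=0.1  ->  decay_times=2, lr = opt.lr * 0.01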
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            print('Decay lr %d times, now lr is %e.' % (decay_times, lr))

    # step4: meters
    mse_loss_meter = AverageValueMeter()
    if opt.use_imp:
        rate_loss_meter = AverageValueMeter()
        rate_display_meter = AverageValueMeter()
        total_loss_meter = AverageValueMeter()

    # ps init

    ps.new_plot('train mse loss',
                opt.print_freq,
                xlabel="iteration",
                ylabel="train_mse_loss")
    ps.new_plot('val mse loss', 1, xlabel="epoch", ylabel="val_mse_loss")
    if opt.use_imp:
        ps.new_plot('train rate value',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_rate_value")
        ps.new_plot('train rate loss',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_rate_loss")
        ps.new_plot('train total loss',
                    opt.print_freq,
                    xlabel="iteration",
                    ylabel="train_total_loss")
        ps.new_plot('val rate value',
                    1,
                    xlabel="iteration",
                    ylabel="val_rate_value")
        ps.new_plot('val rate loss',
                    1,
                    xlabel="iteration",
                    ylabel="val_rate_loss")
        ps.new_plot('val total loss',
                    1,
                    xlabel="iteration",
                    ylabel="val_total_loss")

    # e.g. if evaluating a checkpoint from epoch 600 while max_epoch is also 600, bump max_epoch so the loop still runs
    if opt.only_init_val and opt.max_epoch <= start_epoch:
        opt.max_epoch = start_epoch + 2

    for epoch in range(start_epoch + 1, opt.max_epoch + 1):

        same_lr_epoch += 1
        # per epoch avg loss meter
        mse_loss_meter.reset()
        if opt.use_imp:
            rate_display_meter.reset()
            rate_loss_meter.reset()
            total_loss_meter.reset()
        else:
            total_loss_meter = mse_loss_meter
        # cur_epoch_loss refresh every epoch
        # vis.refresh_plot('cur epoch train mse loss')
        ps.new_plot("cur epoch train mse loss",
                    opt.print_freq,
                    xlabel="iteration in cur epoch",
                    ylabel="train_mse_loss")
        # progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader), ascii=True)
        # progress_bar.set_description('epoch %d/%d, loss = 0.00' % (epoch, opt.max_epoch))

        # Init val
        if (epoch == start_epoch + 1) and opt.init_val:
            print('Init validation ... ')
            if opt.use_imp:
                mse_val_loss, rate_val_loss, total_val_loss, rate_val_display = val(
                    model, val_dataloader, mse_loss, rate_loss, ps)
            else:
                mse_val_loss = val(model, val_dataloader, mse_loss, None, ps)

            ps.add_point('val mse loss', mse_val_loss)
            if opt.use_imp:
                ps.add_point('val rate value', rate_val_display)
                ps.add_point('val rate loss', rate_val_loss)
                ps.add_point('val total loss', total_val_loss)

            ps.make_plot('val mse loss')

            if opt.use_imp:
                ps.make_plot('val rate value')
                ps.make_plot('val rate loss')
                ps.make_plot('val total loss')

            # log sth.
            if opt.use_imp:
                ps.log(
                    'Init Val @ Epoch:{epoch}, lr:{lr}, val_mse_loss: {val_mse_loss}, val_rate_loss: {val_rate_loss}, val_total_loss: {val_total_loss}, val_rate_display: {val_rate_display} '
                    .format(epoch=epoch,
                            lr=lr,
                            val_mse_loss=mse_val_loss,
                            val_rate_loss=rate_val_loss,
                            val_total_loss=total_val_loss,
                            val_rate_display=rate_val_display))
            else:
                ps.log(
                    'Init Val @ Epoch:{epoch}, lr:{lr}, val_mse_loss: {val_mse_loss}'
                    .format(epoch=epoch, lr=lr, val_mse_loss=mse_val_loss))

        if opt.only_init_val:
            print('Only Init Val Over!')
            return

        model.train()

        # if epoch == start_epoch + 1:
        #     print ('Start training, please inspect log file %s!' % logfile_name)
        # mask is the detection bounding box mask
        for idx, data in enumerate(train_dataloader):

            # pdb.set_trace()

            data = Variable(data)
            # mask = Variable(mask)
            # o_mask = Variable(o_mask, requires_grad=False)

            if opt.use_gpu:
                data = data.cuda(non_blocking=True)  # `async` was renamed to `non_blocking` in newer PyTorch
                # mask = mask.cuda(async = True)
                # o_mask = o_mask.cuda(async = True)

            # pdb.set_trace()

            optimizer.zero_grad()
            if opt.use_imp:
                reconstructed, imp_mask_sigmoid = model(data)
            else:
                reconstructed = model(data)

            # print ('imp_mask_height', model.imp_mask_height)
            # pdb.set_trace()

            # print ('type recons', type(reconstructed.data))

            loss = mse_loss(reconstructed, data)
            # loss = mse_loss(reconstructed, data)
            caffe_loss = loss / (2 * opt.batch_size)

            if opt.use_imp:
                rate_loss_ = rate_loss(imp_mask_sigmoid)
                # rate_loss_ = yolo_rate_loss(imp_mask_sigmoid, mask)
                total_loss = caffe_loss + rate_loss_
            else:
                total_loss = caffe_loss

            total_loss.backward()
            optimizer.step()

            mse_loss_meter.add(caffe_loss.data[0])

            if opt.use_imp:
                rate_loss_meter.add(rate_loss_.data[0])
                rate_display_meter.add(imp_mask_sigmoid.data.mean())
                total_loss_meter.add(total_loss.data[0])

            if idx % opt.print_freq == opt.print_freq - 1:
                ps.add_point(
                    'train mse loss',
                    mse_loss_meter.value()[0]
                    if opt.print_smooth else caffe_loss.data[0])
                ps.add_point(
                    'cur epoch train mse loss',
                    mse_loss_meter.value()[0]
                    if opt.print_smooth else caffe_loss.data[0])
                if opt.use_imp:
                    ps.add_point(
                        'train rate value',
                        rate_display_meter.value()[0]
                        if opt.print_smooth else imp_mask_sigmoid.data.mean())
                    ps.add_point(
                        'train rate loss',
                        rate_loss_meter.value()[0]
                        if opt.print_smooth else rate_loss_.data[0])
                    ps.add_point(
                        'train total loss',
                        total_loss_meter.value()[0]
                        if opt.print_smooth else total_loss.data[0])

                if not opt.use_imp:
                    ps.log('Epoch %d/%d, Iter %d/%d, loss = %.2f, lr = %.8f' %
                           (epoch, opt.max_epoch, idx, len(train_dataloader),
                            total_loss_meter.value()[0], lr))
                else:
                    ps.log(
                        'Epoch %d/%d, Iter %d/%d, loss = %.2f, mse_loss = %.2f, rate_loss = %.2f, rate_display = %.2f, lr = %.8f'
                        %
                        (epoch, opt.max_epoch, idx, len(train_dataloader),
                         total_loss_meter.value()[0],
                         mse_loss_meter.value()[0], rate_loss_meter.value()[0],
                         rate_display_meter.value()[0], lr))

                # enter debug mode
                if os.path.exists(opt.debug_file):
                    pdb.set_trace()

        if epoch % opt.save_interval == 0:
            print('Save checkpoint file of epoch %d.' % epoch)
            if use_data_parallel == 1:
                model.module.save(optimizer, epoch)
            else:
                model.save(optimizer, epoch)

        ps.make_plot('train mse loss')
        ps.make_plot('cur epoch train mse loss', epoch)
        if opt.use_imp:
            ps.make_plot("train rate value")
            ps.make_plot("train rate loss")
            ps.make_plot("train total loss")

        if epoch % opt.eval_interval == 0:
            print('Validating ...')
            # val
            if opt.use_imp:
                mse_val_loss, rate_val_loss, total_val_loss, rate_val_display = val(
                    model, val_dataloader, mse_loss, rate_loss, ps)
            else:
                mse_val_loss = val(model, val_dataloader, mse_loss, None, ps)

            ps.add_point('val mse loss', mse_val_loss)
            if opt.use_imp:
                ps.add_point('val rate value', rate_val_display)
                ps.add_point('val rate loss', rate_val_loss)
                ps.add_point('val total loss', total_val_loss)

            ps.make_plot('val mse loss')

            if opt.use_imp:
                ps.make_plot('val rate value')
                ps.make_plot('val rate loss')
                ps.make_plot('val total loss')

            # log sth.
            if opt.use_imp:
                ps.log(
                    'Epoch:{epoch}, lr:{lr}, train_mse_loss: {train_mse_loss}, train_rate_loss: {train_rate_loss}, train_total_loss: {train_total_loss}, train_rate_display: {train_rate_display} \n\
    val_mse_loss: {val_mse_loss}, val_rate_loss: {val_rate_loss}, val_total_loss: {val_total_loss}, val_rate_display: {val_rate_display} '
                    .format(epoch=epoch,
                            lr=lr,
                            train_mse_loss=mse_loss_meter.value()[0],
                            train_rate_loss=rate_loss_meter.value()[0],
                            train_total_loss=total_loss_meter.value()[0],
                            train_rate_display=rate_display_meter.value()[0],
                            val_mse_loss=mse_val_loss,
                            val_rate_loss=rate_val_loss,
                            val_total_loss=total_val_loss,
                            val_rate_display=rate_val_display))
            else:
                ps.log(
                    'Epoch:{epoch}, lr:{lr}, train_mse_loss:{train_mse_loss}, val_mse_loss:{val_mse_loss}'
                    .format(epoch=epoch,
                            lr=lr,
                            train_mse_loss=mse_loss_meter.value()[0],
                            val_mse_loss=mse_val_loss))

        # Adaptively adjust lr:
        # for each lr, if the loss comes out higher than the previous val_loss opt.tolerant_max times,
        # update the learning rate
        # if loss_meter.value()[0] > previous_loss:
        if opt.use_early_adjust:
            if total_loss_meter.value()[0] > previous_loss:
                tolerant_now += 1
                if tolerant_now == opt.tolerant_max:
                    tolerant_now = 0
                    same_lr_epoch = 0
                    lr = lr * opt.lr_decay
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr
                    print('Due to early stop anneal lr to %.10f at epoch %d' %
                          (lr, epoch))
                    ps.log('Due to early stop anneal lr to %.10f at epoch %d' %
                           (lr, epoch))

            else:
                # tolerant_now -= 1
                tolerant_now = 0

        if epoch % opt.lr_anneal_epochs == 0:
            # if same_lr_epoch and same_lr_epoch % opt.lr_anneal_epochs == 0:
            same_lr_epoch = 0
            tolerant_now = 0
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            print('Anneal lr to %.10f at epoch %d due to full epochs.' %
                  (lr, epoch))
            ps.log('Anneal lr to %.10f at epoch %d due to full epochs.' %
                   (lr, epoch))

        if opt.use_file_decay_lr and os.path.exists(opt.lr_decay_file):
            cur_mtime = os.path.getmtime(opt.lr_decay_file)
            if cur_mtime > decay_file_create_time:
                decay_file_create_time = cur_mtime
                lr = lr * opt.lr_decay
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
                print(
                    'Anneal lr to %.10f at epoch %d due to decay-file indicator.'
                    % (lr, epoch))
                ps.log(
                    'Anneal lr to %.10f at epoch %d due to decay-file indicator.'
                    % (lr, epoch))

        previous_loss = total_loss_meter.value()[0]
    if opt.use_batch_process:
        batch_model_id += 1
        train(**kwargs)  # unpack kwargs; the recursive call takes keyword arguments only
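
# As a small standalone check (not part of the project), the closed-form schedule below
# reproduces what the loops above implement: the lr is multiplied by opt.lr_decay once every
# opt.lr_anneal_epochs epochs, which is exactly what the resume branch reconstructs via
# decay_times. The settings in the example are hypothetical.
def lr_at_epoch(base_lr, lr_decay, lr_anneal_epochs, epoch):
    # closed-form equivalent of multiplying lr by lr_decay every lr_anneal_epochs epochs
    return base_lr * (lr_decay ** (epoch // lr_anneal_epochs))

if __name__ == '__main__':
    # hypothetical settings: base lr 1e-4, halved every 100 epochs
    for e in (1, 99, 100, 250, 600):
        print(e, lr_at_epoch(1e-4, 0.5, 100, e))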