Example 1
def train_kd(model, teacher_model, optimizer, loss_fn_kd, T, alpha):
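    """Run one knowledge-distillation training schedule for the student `model`.

    The frozen `teacher_model` provides soft targets and `loss_fn_kd` combines
    them with the ground-truth labels using temperature `T` and weight `alpha`.
    Relies on module-level globals defined elsewhere in the repository:
    cfg, train_datasets, train_dataloader, save_folder and
    adjust_learning_rate_step.
    """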

    # set student model to training mode
    model.train()
    teacher_model.eval()

    lr = cfg.LR

    batch_size = cfg.BATCH_SIZE
    # number of batches per epoch
    epoch_size = len(train_datasets) // batch_size
    ## train for cfg.MAX_EPOCH epochs in total
    max_iter = cfg.MAX_EPOCH * epoch_size

    start_iter = cfg.RESUME_EPOCH * epoch_size

    epoch = cfg.RESUME_EPOCH

    # cosine learning-rate schedule (warmup; used only by the commented-out call below)
    warmup_epoch = 5
    warmup_steps = warmup_epoch * epoch_size
    global_step = 0

    # milestones for the step learning-rate schedule
    stepvalues = (10 * epoch_size, 20 * epoch_size, 30 * epoch_size)
    step_index = 0

    for iteration in range(start_iter, max_iter):
        global_step += 1

        ## refresh the batch iterator at the start of every epoch
        if iteration % epoch_size == 0:
            # create batch iterator
            batch_iterator = iter(train_dataloader)
            loss = 0
            epoch += 1
            ### save a checkpoint every 5 epochs
            if epoch % 5 == 0 and epoch > 0:
                # unwrap the parallel wrapper (.module) when training on multiple GPUs
                model_to_save = model.module if cfg.GPUS > 1 else model
                checkpoint = {
                    'model': model_to_save,
                    'model_state_dict': model_to_save.state_dict(),
                    # 'optimizer_state_dict': optimizer.state_dict(),
                    'epoch': epoch
                }
                torch.save(
                    checkpoint,
                    os.path.join(save_folder,
                                 'epoch_{}.pth'.format(epoch)))

        if iteration in stepvalues:
            step_index += 1
        lr = adjust_learning_rate_step(optimizer, cfg.LR, 0.1, epoch,
                                       step_index, iteration, epoch_size)

        ## alternative: cosine learning-rate schedule with warmup
        # lr = adjust_learning_rate_cosine(optimizer, global_step=global_step,
        #                           learning_rate_base=cfg.LR,
        #                           total_steps=max_iter,
        #                           warmup_steps=warmup_steps)

        ## fetch a batch of images and labels
        # try:
        images, labels = next(batch_iterator)
        # except:
        #     continue

        ## since PyTorch 0.4 Variable and Tensor are merged, so no Variable wrapper is needed here
        if torch.cuda.is_available():
            images, labels = images.cuda(), labels.cuda()
        # the teacher only provides soft targets, so no gradients are needed for it
        with torch.no_grad():
            teacher_outputs = teacher_model(images)
        out = model(images)
        loss = loss_fn_kd(out, labels, teacher_outputs, T, alpha)

        optimizer.zero_grad()  # clear gradients, otherwise they accumulate across backward passes
        loss.backward()  # backpropagate the loss
        optimizer.step()  ## update the parameters

        prediction = torch.max(out, 1)[1]
        train_correct = (prediction == labels).sum()
        ## train_correct is a LongTensor and must be cast to float before division
        # print(train_correct.type())
        train_acc = (train_correct.float()) / batch_size

        if iteration % 10 == 0:
            print('Epoch:' + repr(epoch) + ' || epochiter: ' +
                  repr(iteration % epoch_size) + '/' + repr(epoch_size) +
                  '|| Total iter ' + repr(iteration) + ' || Loss: %.6f||' %
                  (loss.item()) + 'ACC: %.3f ||' % (train_acc * 100) +
                  'LR: %.8f' % (lr))
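
The distillation loss `loss_fn_kd` used above is not included in the snippet. Below is a minimal sketch of what such a helper typically looks like, assuming the standard Hinton-style formulation: a KL-divergence term between temperature-softened student and teacher logits, blended with ordinary cross-entropy on the ground-truth labels via `alpha`. The exact implementation paired with this example may differ.

import torch.nn.functional as F

def loss_fn_kd(outputs, labels, teacher_outputs, T, alpha):
    # Soft-target term: KL divergence between the softened student and teacher
    # distributions; the T*T factor keeps gradient magnitudes comparable
    # across temperatures.
    soft_loss = F.kl_div(F.log_softmax(outputs / T, dim=1),
                         F.softmax(teacher_outputs / T, dim=1),
                         reduction='batchmean') * (T * T)
    # Hard-target term: ordinary cross-entropy against the labels.
    hard_loss = F.cross_entropy(outputs, labels)
    # alpha balances the soft and hard targets.
    return alpha * soft_loss + (1.0 - alpha) * hard_loss
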
Example 3
def train():
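    """End-to-end training loop for the 4-channel AWNet model.

    Relies on module-level names defined elsewhere in the repository:
    trainConfig, LoadData, TRAIN_SIZE, TEST_SIZE, ms_Loss, AWNet,
    validation, to_psnr, print_log and adjust_learning_rate_step.
    """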
    device_ids = [0]
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("CUDA visible devices: " + str(torch.cuda.device_count()))
    print("CUDA Device Name: " + str(torch.cuda.get_device_name(device)))

    # Initialize loss and model
    loss = ms_Loss().to(device)
    net = AWNet(4, 3, block=[3, 3, 3, 4, 4]).to(device)
    net = nn.DataParallel(net, device_ids=device_ids)
    new_lr = trainConfig.learning_rate[0]

    # Reload
    if trainConfig.pretrain:
        net.load_state_dict(
            torch.load(
                '{}/best_4channel.pkl'.format(trainConfig.save_best),
                map_location=device)["model_state"])
        print('weight loaded.')
    else:
        print('no weight loaded.')
    pytorch_total_params = sum(
        p.numel() for p in net.parameters() if p.requires_grad)
    print("Total_params: {}".format(pytorch_total_params))

    # optimizer (the learning rate is adjusted manually at the start of each epoch below)
    optimizer = torch.optim.Adam(
        net.parameters(), lr=new_lr, betas=(0.9, 0.999))

    # Dataloaders
    train_dataset = LoadData(
        trainConfig.data_dir, TRAIN_SIZE, dslr_scale=1, test=False)
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=trainConfig.batch_size,
        shuffle=True,
        num_workers=32,
        pin_memory=True,
        drop_last=True)

    test_dataset = LoadData(
        trainConfig.data_dir, TEST_SIZE, dslr_scale=1, test=True)
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=8,
        shuffle=False,
        num_workers=18,
        pin_memory=True,
        drop_last=False)

    print('Train loader length: {}'.format(len(train_loader)))

    pre_psnr, pre_ssim = validation(net, test_loader, device, save_tag=True)
    print('previous PSNR: {:.4f}, previous ssim: {:.4f}'.format(
        pre_psnr, pre_ssim))
    iteration = 0
    for epoch in range(trainConfig.epoch):
        psnr_list = []
        start_time = time.time()
        if epoch > 0:
            new_lr = adjust_learning_rate_step(
                optimizer, epoch, trainConfig.epoch, trainConfig.learning_rate)
        for batch_id, data in enumerate(train_loader):
            x, target, _ = data
            x = x.to(device)
            target = target.to(device)
            pred, _ = net(x)

            optimizer.zero_grad()

            total_loss, losses = loss(pred, target)
            total_loss.backward()
            optimizer.step()

            iteration += 1
            if trainConfig.print_loss:
                print("epoch:{}/{} | Loss: {:.4f} ".format(
                    epoch, trainConfig.epoch, total_loss.item()))
            if not (batch_id % 1000):
                print('Epoch:{0}, Iteration:{1}'.format(epoch, batch_id))

            psnr_list.extend(to_psnr(pred[0], target))

        train_psnr = sum(psnr_list) / len(psnr_list)
        state = {
            "model_state": net.state_dict(),
            "lr": new_lr,
        }
        torch.save(state, '{}/four_channel_epoch_{}.pkl'.format(
            trainConfig.checkpoints, epoch))
        print('saved checkpoint')

        one_epoch_time = time.time() - start_time
        print('time: {}, train psnr: {}'.format(one_epoch_time, train_psnr))
        val_psnr, val_ssim = validation(
            net, test_loader, device, save_tag=True)
        print_log(epoch + 1, trainConfig.epoch, one_epoch_time, train_psnr,
                  val_psnr, val_ssim, 'multi_loss')

        if val_psnr >= pre_psnr:
            state = {
                "model_state": net.state_dict(),
                "lr": new_lr,
            }

            print('saved best weight')
            torch.save(state, '{}/best_4channel.pkl'.format(
                trainConfig.save_best))
            pre_psnr = val_psnr
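
Both examples call a helper named `adjust_learning_rate_step` that is not shown. As used in the last example it takes the optimizer, the current epoch, the total number of epochs, and a list of learning rates (`trainConfig.learning_rate`). A plausible minimal sketch, assuming the list defines equal-length phases of training, is given below; the actual helper in the source repository may differ.

def adjust_learning_rate_step(optimizer, epoch, num_epochs, learning_rate):
    # Split training into len(learning_rate) equal phases and pick the
    # rate assigned to the phase the current epoch falls into.
    phase = min(epoch * len(learning_rate) // num_epochs,
                len(learning_rate) - 1)
    lr = learning_rate[phase]
    # Apply the selected rate to every parameter group of the optimizer.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr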