import os

import torch


def train_kd(model, teacher_model, optimizer, loss_fn_kd, T, alpha):
    # Set the student model to training mode and the teacher to eval mode.
    model.train()
    teacher_model.eval()

    lr = cfg.LR
    batch_size = cfg.BATCH_SIZE

    # Number of batches per epoch.
    epoch_size = len(train_datasets) // batch_size
    # Train for MAX_EPOCH epochs in total.
    max_iter = cfg.MAX_EPOCH * epoch_size
    start_iter = cfg.RESUME_EPOCH * epoch_size
    epoch = cfg.RESUME_EPOCH

    # Parameters for the (optional) cosine learning-rate schedule.
    warmup_epoch = 5
    warmup_steps = warmup_epoch * epoch_size
    global_step = 0

    # Parameters for the step learning-rate schedule.
    stepvalues = (10 * epoch_size, 20 * epoch_size, 30 * epoch_size)
    step_index = 0

    for iteration in range(start_iter, max_iter):
        global_step += 1

        # Refresh the batch iterator at the start of every epoch.
        if iteration % epoch_size == 0:
            batch_iterator = iter(train_dataloader)
            loss = 0
            epoch += 1

            # Save a checkpoint every 5 epochs.
            if epoch % 5 == 0 and epoch > 0:
                if cfg.GPUS > 1:
                    checkpoint = {
                        'model': model.module,
                        'model_state_dict': model.module.state_dict(),
                        # 'optimizer_state_dict': optimizer.state_dict(),
                        'epoch': epoch
                    }
                else:
                    checkpoint = {
                        'model': model,
                        'model_state_dict': model.state_dict(),
                        # 'optimizer_state_dict': optimizer.state_dict(),
                        'epoch': epoch
                    }
                torch.save(checkpoint,
                           os.path.join(save_folder, 'epoch_{}.pth'.format(epoch)))

        if iteration in stepvalues:
            step_index += 1
        # Step learning-rate schedule.
        lr = adjust_learning_rate_step(optimizer, cfg.LR, 0.1, epoch,
                                       step_index, iteration, epoch_size)
        # Alternative: cosine schedule with warmup.
        # lr = adjust_learning_rate_cosine(optimizer, global_step=global_step,
        #                                  learning_rate_base=cfg.LR,
        #                                  total_steps=max_iter,
        #                                  warmup_steps=warmup_steps)

        # Fetch a batch of images and labels.
        # try:
        images, labels = next(batch_iterator)
        # except:
        #     continue

        # Since PyTorch 0.4, Variable has been merged into Tensor, so no Variable wrapping is needed.
        if torch.cuda.is_available():
            images, labels = images.cuda(), labels.cuda()

        # The teacher only provides targets, so no gradients are needed for it.
        with torch.no_grad():
            teacher_outputs = teacher_model(images)
        out = model(images)
        loss = loss_fn_kd(out, labels, teacher_outputs, T, alpha)

        optimizer.zero_grad()  # Clear gradients; otherwise they accumulate across backward passes.
        loss.backward()        # Back-propagate the loss.
        optimizer.step()       # Update the student parameters.

        prediction = torch.max(out, 1)[1]
        train_correct = (prediction == labels).sum()
        # train_correct is a LongTensor; convert it to float before dividing.
        train_acc = train_correct.float() / batch_size

        if iteration % 10 == 0:
            print('Epoch:' + repr(epoch) + ' || epochiter: ' + repr(iteration % epoch_size)
                  + '/' + repr(epoch_size) + ' || Total iter ' + repr(iteration)
                  + ' || Loss: %.6f || ' % loss.item()
                  + 'ACC: %.3f || ' % (train_acc * 100)
                  + 'LR: %.8f' % lr)
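The distillation loss loss_fn_kd is passed in from elsewhere; the call loss_fn_kd(out, labels, teacher_outputs, T, alpha) matches the usual Hinton-style formulation that mixes a temperature-softened KL term with ordinary cross-entropy. A minimal sketch under that assumption (not necessarily the source's exact implementation) is:

import torch.nn.functional as F

def loss_fn_kd(outputs, labels, teacher_outputs, T, alpha):
    # Soft-target term: KL divergence between the temperature-softened student
    # and teacher distributions, scaled by T^2 so gradient magnitudes stay
    # comparable across temperatures.
    soft_loss = F.kl_div(F.log_softmax(outputs / T, dim=1),
                         F.softmax(teacher_outputs / T, dim=1),
                         reduction='batchmean') * (T * T)
    # Hard-label term: ordinary cross-entropy against the ground-truth labels.
    hard_loss = F.cross_entropy(outputs, labels)
    # alpha balances the distillation term against the hard-label term.
    return alpha * soft_loss + (1.0 - alpha) * hard_loss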
import time

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Project-specific modules (AWNet, ms_Loss, LoadData, trainConfig, validation,
# to_psnr, print_log, adjust_learning_rate_step, TRAIN_SIZE, TEST_SIZE) are
# assumed to be imported from the repository as in the original script.


def train():
    device_ids = [0]
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("CUDA visible devices: " + str(torch.cuda.device_count()))
    print("CUDA Device Name: " + str(torch.cuda.get_device_name(device)))

    # Initialize loss and model.
    loss = ms_Loss().to(device)
    net = AWNet(4, 3, block=[3, 3, 3, 4, 4]).to(device)
    net = nn.DataParallel(net, device_ids=device_ids)
    new_lr = trainConfig.learning_rate[0]

    # Optionally reload pretrained weights.
    if trainConfig.pretrain:
        net.load_state_dict(
            torch.load('{}/best_4channel.pkl'.format(trainConfig.save_best),
                       map_location=device)["model_state"])
        print('weight loaded.')
    else:
        print('no weight loaded.')

    pytorch_total_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
    print("Total_params: {}".format(pytorch_total_params))

    # Optimizer.
    optimizer = torch.optim.Adam(net.parameters(), lr=new_lr, betas=(0.9, 0.999))

    # Dataloaders.
    train_dataset = LoadData(trainConfig.data_dir, TRAIN_SIZE, dslr_scale=1, test=False)
    train_loader = DataLoader(dataset=train_dataset, batch_size=trainConfig.batch_size,
                              shuffle=True, num_workers=32, pin_memory=True, drop_last=True)

    test_dataset = LoadData(trainConfig.data_dir, TEST_SIZE, dslr_scale=1, test=True)
    test_loader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False,
                             num_workers=18, pin_memory=True, drop_last=False)

    print('Train loader length: {}'.format(len(train_loader)))

    # Baseline validation metrics, used to decide when to save the best weights.
    pre_psnr, pre_ssim = validation(net, test_loader, device, save_tag=True)
    print('previous PSNR: {:.4f}, previous ssim: {:.4f}'.format(pre_psnr, pre_ssim))

    iteration = 0
    for epoch in range(trainConfig.epoch):
        psnr_list = []
        start_time = time.time()
        if epoch > 0:
            new_lr = adjust_learning_rate_step(optimizer, epoch, trainConfig.epoch,
                                               trainConfig.learning_rate)

        for batch_id, data in enumerate(train_loader):
            x, target, _ = data
            x = x.to(device)
            target = target.to(device)

            pred, _ = net(x)
            optimizer.zero_grad()
            total_loss, losses = loss(pred, target)
            total_loss.backward()
            optimizer.step()
            iteration += 1

            if trainConfig.print_loss:
                print("epoch:{}/{} | Loss: {:.4f}".format(epoch, trainConfig.epoch,
                                                          total_loss.item()))
            if not (batch_id % 1000):
                print('Epoch:{0}, Iteration:{1}'.format(epoch, batch_id))
            psnr_list.extend(to_psnr(pred[0], target))

        train_psnr = sum(psnr_list) / len(psnr_list)

        # Save a checkpoint after every epoch.
        state = {
            "model_state": net.state_dict(),
            "lr": new_lr,
        }
        print('saved checkpoint')
        torch.save(state, '{}/four_channel_epoch_{}.pkl'.format(trainConfig.checkpoints, epoch))

        one_epoch_time = time.time() - start_time
        print('time: {}, train psnr: {}'.format(one_epoch_time, train_psnr))

        # Validate and keep the best weights by PSNR.
        val_psnr, val_ssim = validation(net, test_loader, device, save_tag=True)
        print_log(epoch + 1, trainConfig.epoch, one_epoch_time, train_psnr,
                  val_psnr, val_ssim, 'multi_loss')

        if val_psnr >= pre_psnr:
            state = {
                "model_state": net.state_dict(),
                "lr": new_lr,
            }
            print('saved best weight')
            torch.save(state, '{}/best_4channel.pkl'.format(trainConfig.save_best))
            pre_psnr = val_psnr
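train() steps the learning rate once per epoch through adjust_learning_rate_step(optimizer, epoch, trainConfig.epoch, trainConfig.learning_rate), whose body is not shown here (and whose signature differs from the same-named helper used in train_kd). One plausible sketch, assuming trainConfig.learning_rate is a list of rates spread evenly over the epochs, is:

def adjust_learning_rate_step(optimizer, epoch, num_epochs, learning_rate):
    # Hypothetical step schedule: split training into len(learning_rate) equal
    # phases and apply the corresponding entry of the list to every param group.
    phase = min(epoch * len(learning_rate) // num_epochs, len(learning_rate) - 1)
    new_lr = learning_rate[phase]
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr
    return new_lr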