import torch
from torch.cuda.amp import autocast, GradScaler


def start_train():
    """Training loop with automatic mixed precision (AMP) and gradient accumulation."""
    use_amp = True
    # Run N forward/backward passes before each parameter update, to simulate
    # a larger batch (effective batch = batch_size * N).
    iter_size = 8

    myNet = MyNet(use_amp).to("cuda:0")
    myNet = torch.nn.DataParallel(myNet, device_ids=[0, 1])  # data parallelism
    myNet.train()

    # Initialize the gradient scaler before training starts.
    scaler = GradScaler() if use_amp else None

    # Resume from a checkpoint. The scaler is stateful, so its state must be
    # restored along with the model and optimizer.
    if resume_train:
        myNet.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        if use_amp:
            scaler.load_state_dict(checkpoint["scaler"])

    for epoch in range(1, 100):
        for batch_idx, (input, target) in enumerate(dataloader_train):
            # Move the data to the primary GPU of the data-parallel model.
            input = input.to("cuda:0")
            target = target.to("cuda:0")

            if use_amp:
                # autocast automatically runs ops that support it in FP16.
                with autocast():
                    feature = myNet(input)
                    losses = loss_function(target, feature)
                    loss = losses / iter_size
                scaler.scale(loss).backward()
            else:
                feature = myNet(input)
                losses = loss_function(target, feature)
                loss = losses / iter_size
                loss.backward()

            # Accumulate gradients over iter_size batches, then update.
            if (batch_idx + 1) % iter_size == 0:
                if use_amp:
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()
                # Clear the accumulated gradients.
                optimizer.zero_grad()

        # The scaler is stateful and must be saved so training can resume.
        state = {
            "model": myNet.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        if use_amp:
            state["scaler"] = scaler.state_dict()
        torch.save(state, "filename.pth")
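The resume branch above assumes a `checkpoint` dict is already in memory. A minimal sketch of producing it from the file saved at the end of each epoch, reusing the "filename.pth" path and key names from the snippet; when and where this runs is an assumption:

# Sketch: load the checkpoint written by start_train() so the resume branch
# can restore model, optimizer, and scaler state (assumes the file exists).
checkpoint = torch.load("filename.pth", map_location="cuda:0")
print(sorted(checkpoint.keys()))  # ['model', 'optimizer', 'scaler']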
inds_sim = inds_sim.cuda()
inds_scr = inds_scr.cuda()
# Variable is deprecated since PyTorch 0.4; tensors carry autograd directly.
target_scr = target_scr.cuda()

# set minLabels
args.minLabels = len(mask_inds)

# train
model = MyNet(profile["count"], args.nChannel, args.nConv)
if use_cuda:
    model.cuda()
model.train()

# similarity loss definition
loss_fn = torch.nn.CrossEntropyLoss()

# scribble loss definition
loss_fn_scr = torch.nn.CrossEntropyLoss()

# continuity loss definition (size_average is deprecated; use reduction='mean')
loss_hpy = torch.nn.L1Loss(reduction='mean')
loss_hpz = torch.nn.L1Loss(reduction='mean')

# All-zero targets for the vertical/horizontal response differences.
HPy_target = torch.zeros(im.shape[1] - 1, im.shape[2], args.nChannel)
HPz_target = torch.zeros(im.shape[1], im.shape[2] - 1, args.nChannel)
if use_cuda:
    HPy_target = HPy_target.cuda()
    HPz_target = HPz_target.cuda()
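The zero targets above are compared against finite differences of the network output inside the training loop, which this excerpt cuts off before. A sketch of that comparison, assuming `output` is the per-pixel response map reshaped to (H, W, nChannel):

# Sketch (assumed, since the training loop is not shown): the continuity
# losses drive neighbouring pixel responses toward each other.
HPy = output[1:, :, :] - output[0:-1, :, :]  # vertical neighbour differences
HPz = output[:, 1:, :] - output[:, 0:-1, :]  # horizontal neighbour differences
lhpy = loss_hpy(HPy, HPy_target)  # penalize vertical discontinuity
lhpz = loss_hpz(HPz, HPz_target)  # penalize horizontal discontinuity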
def do_train(data_path, model_name='mymodel', use_gpu=False,
             epoch_num=5, batch_size=100, learning_rate=0.01):
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        model = MyNet()
        model.train()
        train_loader = load_data(data_path, mode='train')
        optimizer = fluid.optimizer.SGDOptimizer(
            learning_rate=learning_rate, parameter_list=model.parameters())
        iter = 0
        for epoch_id in range(epoch_num):
            for batch_id, data in enumerate(train_loader()):
                # Prepare the data in the format the framework expects.
                image_data, label_data = data
                # Convert the data to Paddle dygraph variables.
                image = fluid.dygraph.to_variable(image_data)
                label = fluid.dygraph.to_variable(label_data)

                # Forward pass: get both the model output and the accuracy.
                predict, avg_acc = model(image, label)

                # Compute the loss, averaged over the batch.
                loss = fluid.layers.cross_entropy(predict, label)
                avg_loss = fluid.layers.mean(loss)

                # Every 100 batches, print the current loss and accuracy.
                if batch_id != 0 and batch_id % 100 == 0:
                    print("epoch: {}, batch: {}, loss is: {}, acc is: {}".format(
                        epoch_id, batch_id, avg_loss.numpy(), avg_acc.numpy()))
                    log_writer.add_scalar(tag='acc', step=iter, value=avg_acc.numpy())
                    log_writer.add_scalar(tag='loss', step=iter, value=avg_loss.numpy())
                    iter = iter + 100

                # Backward pass and parameter update.
                avg_loss.backward()
                optimizer.minimize(avg_loss)
                model.clear_gradients()

            # save_dygraph writes model parameters to .pdparams and optimizer
            # state to .pdopt, so both can share the same path prefix.
            fluid.save_dygraph(
                model.state_dict(),
                os.path.join(CHECKPOINT_PATH, f'{model_name}_epoch_{epoch_id}'))
            fluid.save_dygraph(
                optimizer.state_dict(),
                os.path.join(CHECKPOINT_PATH, f'{model_name}_epoch_{epoch_id}'))

        # Save the final model.
        fluid.save_dygraph(model.state_dict(),
                           os.path.join(MODEL_PATH, model_name))
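For completeness, a sketch of restoring the saved parameters for evaluation, assuming the same MODEL_PATH and model_name used above; fluid.load_dygraph returns a (parameters, optimizer-state) pair, and only the first is needed for inference:

# Sketch: restore the model saved by do_train() for evaluation.
with fluid.dygraph.guard(fluid.CPUPlace()):
    model = MyNet()
    param_dict, _ = fluid.load_dygraph(os.path.join(MODEL_PATH, 'mymodel'))
    model.set_dict(param_dict)  # load the parameters into the network
    model.eval()                # switch to evaluation mode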