def train(train_loader, model, criterion, optimizer,
          lr_init=None, lr_now=None, glob_step=None, lr_decay=None, gamma=None,
          max_norm=True):
    losses = utils.AverageMeter()
    model.train()

    for i, (inps, tars) in enumerate(tqdm(train_loader)):
        glob_step += 1
        if glob_step % lr_decay == 0 or glob_step == 1:
            lr_now = utils.lr_decay(optimizer, glob_step, lr_init, lr_decay, gamma)

        # make prediction with model
        inputs = Variable(inps.cuda())
        targets = Variable(tars.cuda(non_blocking=True))
        outputs = model(inputs)

        # calculate loss
        optimizer.zero_grad()
        loss = criterion(outputs, targets)
        losses.update(loss.item(), inputs.size(0))
        loss.backward()
        if max_norm:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

    return glob_step, lr_now, losses.avg
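# Every train() variant in this section relies on utils.AverageMeter and utils.lr_decay,
# neither of which is defined here. The sketch below is a hypothetical stand-in inferred
# from the call sites (losses.update(val, n) / losses.avg; lr_decay(optimizer, step,
# lr_init, decay_step, gamma) returning the new learning rate) -- not the project's
# actual utils module.
class AverageMeter:
    """Tracks the running average of a scalar (here: the per-batch loss)."""
    def __init__(self):
        self.sum, self.count, self.avg = 0.0, 0, 0.0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def lr_decay(optimizer, step, lr_init, decay_step, gamma):
    """Exponential decay lr = lr_init * gamma ** (step / decay_step), written into
    every parameter group; returns the new learning rate."""
    lr = lr_init * gamma ** (step / decay_step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr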
def train(train_loader, model, criterion, optimizer,
          lr_init=None, lr_now=None, glob_step=None, lr_decay=None, gamma=None,
          max_norm=True):
    losses = utils.AverageMeter()
    model.train()

    start = time.time()
    batch_time = 0
    bar = Bar(">>>", fill=">", max=len(train_loader))

    for i, (inps, tars) in enumerate(train_loader):
        glob_step += 1
        if glob_step % lr_decay == 0 or glob_step == 1:
            lr_now = utils.lr_decay(optimizer, glob_step, lr_init, lr_decay, gamma)

        inputs = Variable(inps.cuda())
        targets = Variable(tars.cuda())
        outputs = model(inputs)

        # calculate loss
        optimizer.zero_grad()
        loss = criterion(outputs, targets)
        losses.update(loss.item(), inputs.size(0))
        loss.backward()
        if max_norm:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        # update summary
        if (i + 1) % 100 == 0:
            batch_time = time.time() - start
            start = time.time()

        bar.suffix = "({batch}/{size}) | batch: {batchtime:.4}ms | Total: {ttl} | ETA: {eta:} | loss: {loss:.4f}".format(
            batch=i + 1,
            size=len(train_loader),
            batchtime=batch_time * 10.0,
            ttl=bar.elapsed_td,
            eta=bar.eta_td,
            loss=losses.avg,
        )
        bar.next()

    bar.finish()
    return glob_step, lr_now, losses.avg
def train(train_loader, model, criterion, optimizer, joint_num,
          lr_init=None, lr_now=None, glob_step=None, lr_decay=None, gamma=None,
          max_norm=True):
    losses = utils.AverageMeter()
    model.train()

    start = time.time()
    batch_time = 0
    bar = Bar('>>>', fill='>', max=len(train_loader))

    for i, data in enumerate(train_loader):
        # Turn down learning rate
        glob_step += 1
        if glob_step % lr_decay == 0 or glob_step == 1:
            lr_now = utils.lr_decay(optimizer, glob_step, lr_init, lr_decay, gamma)

        joint2d, truth = data['joint2d'], data['truth']
        inputs = Variable(joint2d.cuda().type(torch.cuda.FloatTensor))
        targets = Variable(truth.cuda().type(torch.cuda.FloatTensor))

        outputs = model(inputs)
        outputs = torch.reshape(outputs, (-1, joint_num * 3))
        targets = torch.reshape(targets, (-1, joint_num * 3))

        # calculate loss
        optimizer.zero_grad()
        loss = criterion(outputs, targets)
        losses.update(loss.item(), inputs.size(0))
        loss.backward()
        if max_norm:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        # update summary
        if (i + 1) % 100 == 0:
            batch_time = time.time() - start
            start = time.time()

        bar.suffix = '({batch}/{size}) | batch: {batchtime:.4}ms | Total: {ttl} | ETA: {eta:} | loss: {loss:.4f}' \
            .format(batch=i + 1,
                    size=len(train_loader),
                    batchtime=batch_time * 10.0,
                    ttl=bar.elapsed_td,
                    eta=bar.eta_td,
                    loss=losses.avg)
        bar.next()

    bar.finish()
    return glob_step, lr_now, losses.avg
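# The variant above expects each batch to be a dict with 'joint2d' inputs and 'truth'
# 3D targets that flatten to joint_num * 3 values per sample. A hypothetical dataset
# illustrating that contract (names and shapes are assumptions, not the project's
# actual data pipeline):
class PoseDataset(torch.utils.data.Dataset):
    def __init__(self, joints_2d, joints_3d):
        self.joints_2d = joints_2d  # (N, joint_num, 2)
        self.joints_3d = joints_3d  # (N, joint_num, 3)

    def __len__(self):
        return len(self.joints_2d)

    def __getitem__(self, idx):
        return {'joint2d': torch.as_tensor(self.joints_2d[idx]),
                'truth': torch.as_tensor(self.joints_3d[idx])}

# e.g. train_loader = torch.utils.data.DataLoader(PoseDataset(j2d, j3d), batch_size=64, shuffle=True)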
def train(train_loader, model, criterion, optimizer, num_kpts=15, num_classes=200,
          lr_init=None, lr_now=None, glob_step=None, lr_decay=None, gamma=None,
          max_norm=True):
    losses = utils.AverageMeter()
    model.train()

    errs, accs = [], []

    start = time.time()
    batch_time = 0
    bar = Bar('>>>', fill='>', max=len(train_loader))

    for i, sample in enumerate(train_loader):
        glob_step += 1
        if glob_step % lr_decay == 0 or glob_step == 1:
            lr_now = utils.lr_decay(optimizer, glob_step, lr_init, lr_decay, gamma)

        inputs = sample['X'].cuda()
        # NOTE: PyTorch issue with dim0=1.
        if inputs.shape[0] == 1:
            continue
        targets = sample['Y'].reshape(-1).cuda()
        outputs = model(inputs)

        # calculate loss
        optimizer.zero_grad()
        loss = criterion(outputs, targets)
        losses.update(loss.item(), inputs.size(0))
        loss.backward()
        if max_norm:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        # Map outputs to [0, 1].
        softmax = nn.Softmax(dim=1)
        outputs = softmax(outputs)

        outputs = outputs.data.cpu().numpy()
        targets = one_hot(targets.data.cpu().numpy(), num_classes)

        errs.append(np.mean(np.abs(outputs - targets)))
        accs.append(metrics.accuracy_score(np.argmax(targets, axis=1),
                                           np.argmax(outputs, axis=1)))

        # update summary
        if (i + 1) % 100 == 0:
            batch_time = time.time() - start
            start = time.time()

        bar.suffix = '({batch}/{size}) | batch: {batchtime:.4}ms | Total: {ttl} | ETA: {eta:} | loss: {loss:.6f}' \
            .format(batch=i + 1,
                    size=len(train_loader),
                    batchtime=batch_time * 10.0,
                    ttl=bar.elapsed_td,
                    eta=bar.eta_td,
                    loss=losses.avg)
        bar.next()

    bar.finish()

    err = np.mean(np.array(errs, dtype=np.float32))
    acc = np.mean(np.array(accs, dtype=np.float32))
    print(">>> train error: {} <<<".format(err))
    print(">>> train accuracy: {} <<<".format(acc))

    return glob_step, lr_now, losses.avg, err, acc
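# The classification variant above converts integer labels with a one_hot helper before
# computing the element-wise error. A plausible implementation (an assumption, not taken
# from the source):
def one_hot(labels, num_classes):
    """Map an array of integer class ids to an (N, num_classes) one-hot matrix."""
    out = np.zeros((labels.shape[0], num_classes), dtype=np.float32)
    out[np.arange(labels.shape[0]), labels] = 1.0
    return out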
def train(train_loader, model, criterion, optimizer, stat_2d, stat_3d,
          lr_init=None, lr_now=None, glob_step=None, lr_decay=None, gamma=None,
          max_norm=True):
    losses = utils.AverageMeter()
    model.train()

    pbar = tqdm(train_loader)
    for i, (inps, tars) in enumerate(pbar):  # inps = (64, 32)
        glob_step += 1
        if glob_step % lr_decay == 0 or glob_step == 1:
            lr_now = utils.lr_decay(optimizer, glob_step, lr_init, lr_decay, gamma)

        # Input unnormalization
        inputs_unnorm = data_process.unNormalizeData(
            inps.data.cpu().numpy(),
            stat_2d['mean'], stat_2d['std'], stat_2d['dim_use'])  # (64, 64)
        dim_2d_use = stat_2d['dim_use']
        inputs_use = inputs_unnorm[:, dim_2d_use]  # (64, 32)

        # Input distance normalization
        inputs_dist_norm, _ = data_process.input_norm(inputs_use)  # (64, 32), array
        input_dist = torch.tensor(inputs_dist_norm, dtype=torch.float32)

        # Targets unnormalization
        targets_unnorm = data_process.unNormalizeData(
            tars.data.cpu().numpy(),
            stat_3d['mean'], stat_3d['std'], stat_3d['dim_use'])  # (64, 96)
        dim_3d_use = np.array([
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 36, 37, 38, 39, 40, 41, 45, 46, 47, 51, 52, 53, 54, 55,
            56, 57, 58, 59, 75, 76, 77, 78, 79, 80, 81, 82, 83
        ])
        targets_use = targets_unnorm[:, dim_3d_use]  # (64, 48)

        # Targets distance normalization
        targets_dist_norm, _ = data_process.output_norm(targets_use)
        targets_dist = torch.tensor(targets_dist_norm, dtype=torch.float32)

        inputs = Variable(input_dist.cuda())
        targets = Variable(targets_dist.cuda(non_blocking=True))
        outputs = model(inputs)

        # calculate loss
        optimizer.zero_grad()
        loss = criterion(outputs, targets)
        losses.update(loss.item(), inputs.size(0))
        loss.backward()
        if max_norm:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        pbar.set_postfix(tr_loss='{:05.6f}'.format(losses.avg))

    return glob_step, lr_now, losses.avg
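# Each train() here returns (glob_step, lr_now, avg_loss) so the caller can thread the
# step counter and current learning rate through epochs. A minimal sketch of such a
# driver loop (opt.* names are placeholders, not the project's actual options):
glob_step, lr_now = 0, opt.lr
for epoch in range(opt.epochs):
    glob_step, lr_now, train_loss = train(
        train_loader, model, criterion, optimizer, stat_2d, stat_3d,
        lr_init=opt.lr, lr_now=lr_now, glob_step=glob_step,
        lr_decay=opt.lr_decay, gamma=opt.lr_gamma, max_norm=True)
    print("epoch {}: lr {:.2e} | train loss {:.6f}".format(epoch, lr_now, train_loss))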
def train_model(model, dataloaders, criterion, optimizer, cmd, writer, is_inception=False, model_save_path="./"):
    print("-------------------sparse training-----------------------")
    num_epochs = opt.epoch
    log_dir = os.path.join(model_save_path, opt.expID)
    os.makedirs(log_dir, exist_ok=True)
    log_save_path = os.path.join(log_dir, "log.txt")

    since = time.time()
    best_weight = copy.deepcopy(model)
    val_acc_history, train_acc_history, val_loss_history, train_loss_history = [], [], [], []
    train_acc, val_acc, train_loss, val_loss, best_epoch, epoch_acc, epoch = \
        0, 0, float("inf"), float("inf"), 0, 0, 0
    epoch_ls = []
    early_stopping = EarlyStopping(patience=opt.patience, verbose=True)
    # prune_idx, ignore_id, all_conv = parse_module_defs(model)
    # print(prune_idx)
    decay, decay_epoch = 0, []
    stop = False

    log_writer = open(log_save_path, "w")
    log_writer.write(cmd)
    log_writer.write("\n")
    lr = opt.LR

    train_log_name = log_save_path.replace("log.txt", "train_log.csv")
    train_log = open(train_log_name, "w", newline="")
    csv_writer = csv.writer(train_log)
    csv_writer.writerow(write_csv_title())

    os.makedirs("result", exist_ok=True)
    result = os.path.join("result", "{}_result_{}.csv".format(opt.expFolder, computer))
    exist = os.path.exists(result)

    print("----------------------------------------------------------------------------------------------------")
    print(opt)
    print("Training backbone is: {}".format(opt.backbone))
    print("Warm up end at {}".format(warm_up_epoch))
    for k, v in config.bad_epochs.items():
        if v > 1:
            raise ValueError("Wrong stopping accuracy!")
    print("----------------------------------------------------------------------------------------------------")

    utils.draw_graph(epoch_ls, train_loss_history, val_loss_history, train_acc_history, val_acc_history, log_dir)
    flops = utils.print_model_param_flops(model)
    print("FLOPs of current model is {}".format(flops))
    params = utils.print_model_param_nums(model)
    print("Parameters of current model is {}".format(params))
    inf_time = utils.get_inference_time(model, height=input_size, width=input_size)
    print("Inference time is {}".format(inf_time))
    print("----------------------------------------------------------------------------------------------------")

    for epoch in range(num_epochs):
        log_tmp = [opt.expID, epoch]

        if epoch < warm_up_epoch:
            optimizer, lr = warm_up_lr(optimizer, epoch)
        elif epoch == warm_up_epoch:
            lr = opt.LR
        elif epoch > num_epochs * 0.7 and epoch < num_epochs * 0.9:
            optimizer, lr = lr_decay(optimizer, lr)
        elif epoch > num_epochs * 0.9:
            optimizer, lr = lr_decay2(optimizer, lr)
        log_tmp.append(lr)
        log_tmp.append("")

        epoch_start_time = time.time()

        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 20)
        log_writer.write('Epoch {}/{}\n'.format(epoch, num_epochs - 1))
        log_writer.write('-' * 10 + "\n")

        writer.add_scalar("lr", lr, epoch)
        print("Current lr is {}".format(lr))

        for name, param in model.named_parameters():
            writer.add_histogram(name, param.clone().data.to("cpu").numpy(), epoch)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            cls_correct = [0] * class_nums
            cls_sum = [0] * class_nums
            cls_acc = [0] * class_nums

            print(phase)
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            batch_num = 0

            batch_start_time = time.time()
            for names, inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # optimizer, lr = utils.adjust_lr(optimizer, epoch, opt.epoch)
                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    if is_inception and phase == 'train':
                        outputs, aux_outputs = model(inputs)
                        loss1 = criterion(outputs, labels)
                        loss2 = criterion(aux_outputs, labels)
                        loss = loss1 + 0.4 * loss2
                    else:
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)

                    _, preds = torch.max(outputs, 1)
                    right = preds == labels
                    for r, l in zip(right, labels):
                        cls_sum[l] += 1
                        if r:
                            cls_correct[l] += 1

                    if phase == 'train':
                        if opt.mix_precision:
                            with amp.scale_loss(loss, optimizer) as scaled_loss:
                                scaled_loss.backward()
                        else:
                            loss.backward()
                        # sr_flag = True
                        # BNOptimizer.updateBN(sr_flag, model, s, prune_idx)
                        BNOptimizer.updateBN(model, opt.sparse_s)
                        optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

                if batch_num % 100 == 0:
                    print("batch num:", batch_num, "cost time:", time.time() - batch_start_time)
                    batch_start_time = time.time()
                batch_num += 1

            for idx, (s, c) in enumerate(zip(cls_sum, cls_correct)):
                cls_acc[idx] = c / s

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

            bn_sum, bn_num = 0, 0
            for mod in model.modules():
                if isinstance(mod, nn.BatchNorm2d):
                    bn_num += mod.num_features
                    bn_sum += torch.sum(abs(mod.weight))
                    writer.add_histogram("bn_weight", mod.weight.data.cpu().numpy(), epoch)
            bn_ave = bn_sum / bn_num
            print("Current bn : {} --> {}".format(epoch, bn_ave))

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            log_writer.write('{} Loss: {:.4f} Acc: {:.4f}\n'.format(phase, epoch_loss, epoch_acc))

            if phase == 'val':
                log_tmp.insert(5, epoch_acc.tolist())
                log_tmp.insert(6, epoch_loss)
                val_loss_history.append(epoch_loss)
                val_acc_history.append(epoch_acc)
                val_loss = epoch_loss if epoch_loss < val_loss else val_loss
                writer.add_scalar("scalar/val_acc", epoch_acc, epoch)
                writer.add_scalar("Scalar/val_loss", epoch_loss, epoch)

                imgnames, pds = names[:3], [label_dict[i] for i in preds[:record_num].tolist()]
                for idx, (img_path, pd) in enumerate(zip(imgnames, pds)):
                    img = cv2.imread(img_path)
                    img = cv2.putText(img, pd, (20, 50), cv2.FONT_HERSHEY_PLAIN, 2, (0, 255, 0), 2)
                    # cv2.imwrite("tmp/{}_{}.jpg".format(epoch, idx), img)
                    tb_img = utils.image2tensorboard(img)
                    # images = torch.cat((images, torch.unsqueeze(tb_img, 0)), 0)
                    writer.add_image("pred_image_for_epoch{}".format(epoch), tb_img, epoch)

                if epoch % opt.save_interval == 0 and epoch != 0:
                    torch.save(
                        model.state_dict(),
                        os.path.join(model_save_path, "{}_{}_{}cls_{}.pth".format(
                            opt.expID, opt.backbone, class_nums, epoch)))
                # writer.add_image("pred_image_for_epoch{}".format(epoch), images[1:, :, :, :])

                if epoch_acc > val_acc:
                    torch.save(
                        model.state_dict(),
                        os.path.join(model_save_path, "{}_{}_{}cls_best.pth".format(
                            opt.expID, opt.backbone, class_nums)))
                    val_acc = epoch_acc
                    best_epoch = epoch
                    best_weight = copy.deepcopy(model)
            else:
                log_tmp.append(epoch_acc.tolist())
                log_tmp.append(epoch_loss)
                train_acc_history.append(epoch_acc)
                train_loss_history.append(epoch_loss)
                train_acc = epoch_acc if epoch_acc > train_acc else train_acc
                train_loss = epoch_loss if epoch_loss < train_loss else train_loss
                writer.add_scalar("scalar/train_acc", epoch_acc, epoch)
                writer.add_scalar("Scalar/train_loss", epoch_loss, epoch)

        log_tmp += log_of_each_class(cls_acc)
        epoch_ls.append(epoch)

        epoch_time_cost = time.time() - epoch_start_time
        print("epoch complete in {:.0f}m {:.0f}s".format(
            epoch_time_cost // 60, epoch_time_cost % 60))
        log_writer.write("epoch complete in {:.0f}m {:.0f}s\n".format(
            epoch_time_cost // 60, epoch_time_cost % 60))

        torch.save(opt, '{}/option.pth'.format(model_save_path))
        csv_writer.writerow(log_tmp)

    csv_writer.writerow([])
    csv_writer.writerow(csv_cls_num(dataloaders))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(val_acc))
    log_writer.write('Training complete in {:.0f}m {:.0f}s\n'.format(time_elapsed // 60, time_elapsed % 60))
    log_writer.write('Best val Acc: {:.4f}\n'.format(val_acc))
    log_writer.close()

    with open(result, "a+") as f:
        if not exist:
            title_str = "id,backbone,params,flops,time,batch_size,optimizer,freeze_bn,freeze,sparse,sparse_decay," \
                        "epoch_num,LR,weightDecay,loadModel,location, ,folder_name,train_acc,train_loss,val_acc," \
                        "val_loss,training_time, best_epoch,total_epoch\n"
            title_str = write_decay_title(len(decay_epoch), title_str)
            f.write(title_str)
        info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{},{},{},{},{},{}\n".format(
            opt.expID, opt.backbone, params, flops, inf_time, opt.batch, opt.optMethod, opt.freeze_bn,
            opt.freeze, opt.sparse_s, opt.sparse_decay, opt.epoch, opt.LR, opt.weightDecay, opt.loadModel,
            computer, os.path.join(opt.expFolder, opt.expID), train_acc, train_loss, val_acc, val_loss,
            time_elapsed, best_epoch, epoch)
        info_str = write_decay_info(decay_epoch, info_str)
        f.write(info_str)
def train_multiposenet(train_loader, model, criterion, optimizer,
                       lr_init=None, lr_now=None, glob_step=None, lr_decay=None, gamma=None,
                       max_norm=True):
    model.train()
    l2_loss, cvae_loss, gsnn_loss, kl_loss = (utils.AverageMeter(), utils.AverageMeter(),
                                              utils.AverageMeter(), utils.AverageMeter())

    for i, (inps, tars, _) in enumerate(train_loader):
        glob_step += 1
        if glob_step % lr_decay == 0 or glob_step == 1:
            lr_now = utils.lr_decay(optimizer, glob_step, lr_init, lr_decay, gamma)

        # forward pass
        inputs = Variable(inps.cuda())
        targets = Variable(tars.cuda())
        out_cvae, out_gsnn, post_mu, post_logvar = model(inputs, targets)

        # backward pass
        optimizer.zero_grad()
        loss_l2, loss_cvae, loss_gsnn, loss_kl = loss_function(
            out_cvae, out_gsnn, targets, post_mu, post_logvar)
        loss_l2 = loss_l2 * option.weight_l2
        loss_cvae = loss_cvae * option.weight_l2
        loss_gsnn = loss_gsnn * option.weight_l2
        loss_kl = loss_kl * option.weight_kl

        l2_loss.update(loss_l2.item(), inputs.size(0))
        cvae_loss.update(loss_cvae.item(), inputs.size(0))
        gsnn_loss.update(loss_gsnn.item(), inputs.size(0))
        kl_loss.update(loss_kl.item(), inputs.size(0))

        loss = loss_kl + loss_l2
        loss.backward()
        if max_norm:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        # update summary
        if i % 100 == 0:
            print('({batch}/{size}) | loss l2: {loss_l2:.4f} | loss cvae: {loss_cvae:.4f} | '
                  'loss gsnn: {loss_gsnn:.4f} | loss kl: {loss_kl:.4f}'
                  .format(batch=i + 1,
                          size=len(train_loader),
                          loss_l2=l2_loss.avg,
                          loss_cvae=cvae_loss.avg,
                          loss_gsnn=gsnn_loss.avg,
                          loss_kl=kl_loss.avg))
            sys.stdout.flush()

    return glob_step, lr_now, l2_loss.avg
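# loss_function is not shown in this section; from the call site it returns separate
# reconstruction terms for the CVAE and GSNN branches plus a KL term computed from the
# posterior mean and log-variance. A hedged sketch consistent with that interface
# (the real weighting and reduction may differ):
import torch.nn.functional as F

def loss_function(out_cvae, out_gsnn, targets, post_mu, post_logvar):
    loss_cvae = F.mse_loss(out_cvae, targets)
    loss_gsnn = F.mse_loss(out_gsnn, targets)
    loss_l2 = loss_cvae + loss_gsnn
    # standard Gaussian KL divergence: -0.5 * (1 + log(var) - mu^2 - var)
    loss_kl = -0.5 * torch.mean(1 + post_logvar - post_mu.pow(2) - post_logvar.exp())
    return loss_l2, loss_cvae, loss_gsnn, loss_kl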
def train(train_loader, model, criterion, optimizer,
          lr_init=None, lr_now=None, glob_step=None, lr_decay=None, gamma=None,
          max_norm=True):
    losses = utils.AverageMeter()
    model.train()

    start = time.time()
    batch_time = 0
    # bar = Bar('>>>', fill='>', max=len(train_loader))
    train_loader = tqdm(train_loader, dynamic_ncols=True)

    for i, (inps, tars) in enumerate(train_loader):
        glob_step += 1
        if glob_step % lr_decay == 0 or glob_step == 1:
            lr_now = utils.lr_decay(optimizer, glob_step, lr_init, lr_decay, gamma)

        inputs = Variable(inps.cuda())
        targets = Variable(tars.cuda(non_blocking=True))

        outputs, outputs_inputs = model(inputs)
        # outputs = model(inputs)

        # calculate loss
        optimizer.zero_grad()
        # ###########
        # alpha = 0.0
        # loss1 = criterion(outputs[0], targets)
        # loss2 = criterion(outputs[1], targets)
        # loss = alpha * loss1 + (1.0 - alpha) * loss2
        # ###########
        loss = criterion(outputs, targets)
        loss_input = criterion(outputs_inputs, inputs)
        loss = loss + loss_input

        losses.update(loss.item(), inputs.size(0))
        loss.backward()
        if max_norm:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        # update summary
        if (i + 1) % 100 == 0:
            batch_time = time.time() - start
            start = time.time()

        train_loader.set_description(
            '({batch}/{size}) | batch: {batchtime:.4}ms | loss: {loss:.6f}'.format(
                batch=i + 1,
                size=len(train_loader),
                batchtime=batch_time * 10.0,
                loss=losses.avg)
        )
    train_loader.close()

    return glob_step, lr_now, losses.avg
def train(train_loader, model, criterion, optimizer, stat_2d, stat_3d,
          lr_init=None, lr_now=None, glob_step=None, lr_decay=None, gamma=None,
          max_norm=True):
    losses = utils.AverageMeter()
    model.train()

    pbar = tqdm(train_loader)
    for i, (inps, tars) in enumerate(pbar):  # inps = (64, 32)
        glob_step += 1
        if glob_step % lr_decay == 0 or glob_step == 1:
            lr_now = utils.lr_decay(optimizer, glob_step, lr_init, lr_decay, gamma)

        # Input unnormalization: the unnormalized size is (64, 64) -- (0 * stdMat) + meanMat,
        # so every position except the original 16 joints holds junk values.
        inputs_unnorm = data_process.unNormalizeData(
            inps.data.cpu().numpy(),
            stat_2d['mean'], stat_2d['std'], stat_2d['dim_use'])  # (64, 64)
        # select the 32 useful columns via dim_2d_use => (64, 32)
        dim_2d_use = stat_2d['dim_use']
        inputs_use = inputs_unnorm[:, dim_2d_use]  # (64, 32)

        # Input distance normalization
        inputs_dist_norm, _ = data_process.input_norm(inputs_use)  # (64, 32), array
        input_dist = torch.tensor(inputs_dist_norm, dtype=torch.float32)

        # Targets unnormalization
        targets_unnorm = data_process.unNormalizeData(
            tars.data.cpu().numpy(),
            stat_3d['mean'], stat_3d['std'], stat_3d['dim_use'])  # (64, 96)
        dim_3d_use = np.array([
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 36, 37, 38, 39, 40, 41, 45, 46, 47, 51, 52, 53, 54, 55,
            56, 57, 58, 59, 75, 76, 77, 78, 79, 80, 81, 82, 83
        ])
        targets_use = targets_unnorm[:, dim_3d_use]  # (64, 48)

        # Targets distance normalization
        targets_dist_norm, _ = data_process.output_norm(targets_use)
        targets_dist = torch.tensor(targets_dist_norm, dtype=torch.float32)

        inputs = Variable(input_dist.cuda())
        targets = Variable(targets_dist.cuda(non_blocking=True))
        outputs = model(inputs)

        # calculate loss
        optimizer.zero_grad()
        loss = criterion(outputs, targets)
        losses.update(loss.item(), inputs.size(0))
        loss.backward()
        if max_norm:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        pbar.set_postfix(tr_loss='{:05.6f}'.format(losses.avg))

    return glob_step, lr_now, losses.avg
def main(opt):
    err_best = 1000
    glob_step = 0
    lr_now = opt.lr
    lr_decay = opt.lr_decay
    lr_init = opt.lr
    lr_gamma = opt.lr_gamma
    start_epoch = 0

    file_path = os.path.join(opt.ckpt, 'opt.json')
    with open(file_path, 'w') as f:
        f.write(json.dumps(vars(opt), sort_keys=True, indent=4))

    # create model
    print(">>> creating model")
    model = LinearModel(opt.batch_size, opt.predict_14)
    # refine_2d_model = refine_2d_model(opt.batch_size, opt.predict_14)
    model = model.cuda()
    model.apply(weight_init)
    # refine_2d_model = refine_2d_model.cuda()
    # refine_2d_model.apply(weight_init)
    print(">>> total params: {:.2f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))
    # + sum(p.numel() for p in refine_2d_model.parameters()) / 1000000.0

    criterion = nn.MSELoss(reduction='mean').cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    # refine_2d_model_optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)

    # load checkpoint
    if opt.resume:
        print(">>> loading ckpt from '{}'".format(opt.load))
        ckpt = torch.load(opt.load)
        start_epoch = ckpt['epoch']
        err_best = ckpt['err']
        glob_step = ckpt['step']
        lr_now = ckpt['lr']
        model.load_state_dict(ckpt['state_dict'])
        # refine_2d_model.load_state_dict(ckpt['refine_state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        # refine_2d_model_optimizer.load_state_dict(ckpt['refine_optimizer'])
        print(">>> ckpt loaded (epoch: {} | err: {})".format(start_epoch, err_best))

    # list of actions to use
    actions = data_utils.define_actions(opt.action)
    num_actions = len(actions)
    print(">>> actions to use (total: {}):".format(num_actions))
    pprint(actions, indent=4)
    print(">>>")

    # data loading
    print(">>> loading data")
    # Load camera parameters
    SUBJECT_IDS = [1, 5, 6, 7, 8, 9, 11]
    rcams = cameras.load_cameras(opt.cameras_path, SUBJECT_IDS)

    # Load 3d data and load (or create) 2d projections
    train_set_3d, test_set_3d, data_mean_3d, data_std_3d, dim_to_ignore_3d, dim_to_use_3d, \
        train_root_positions, test_root_positions = data_utils.read_3d_data(
            actions, opt.data_dir, opt.camera_frame, rcams, opt.predict_14)

    # Read stacked hourglass 2D predictions if use_hg, otherwise use groundtruth 2D projections
    if opt.use_hg:
        train_set_2d, test_set_2d, data_mean_2d, data_std_2d, dim_to_ignore_2d, dim_to_use_2d = \
            data_utils.read_2d_predictions(actions, opt.data_dir)
    else:
        train_set_2d, test_set_2d, data_mean_2d, data_std_2d, dim_to_ignore_2d, dim_to_use_2d = \
            data_utils.create_2d_data(actions, opt.data_dir, rcams)
    # gt_train_set_2d, gt_test_set_2d, gt_data_mean_2d, gt_data_std_2d, gt_dim_to_ignore_2d, gt_dim_to_use_2d = \
    #     data_utils.create_2d_data(actions, opt.data_dir, rcams)
    print("done reading and normalizing data.")

    step_time, loss = 0, 0
    current_epoch = start_epoch
    log_every_n_batches = 100
    cudnn.benchmark = True
    best_error = 10000

    while current_epoch < opt.epochs:
        current_epoch = current_epoch + 1

        # === Load training batches for one epoch ===
        encoder_inputs, decoder_outputs = get_all_batches(opt, train_set_2d, train_set_3d, training=True)
        nbatches = len(encoder_inputs)
        print("There are {0} train batches".format(nbatches))
        start_time = time.time()

        # === Loop through all the training batches ===
        current_step = 0
        for i in range(nbatches):
            if (i + 1) % log_every_n_batches == 0:
                # Print progress every log_every_n_batches batches
                print("Working on epoch {0}, batch {1} / {2}... ".format(
                    current_epoch, i + 1, nbatches), end="")

            model.train()

            if glob_step % lr_decay == 0 or glob_step == 1:
                lr_now = utils.lr_decay(optimizer, glob_step, lr_init, lr_decay, lr_gamma)
                # utils.lr_decay(refine_2d_model_optimizer, glob_step, lr_init, lr_decay, lr_gamma)

            enc_in = torch.from_numpy(encoder_inputs[i]).float()
            dec_out = torch.from_numpy(decoder_outputs[i]).float()
            inputs = Variable(enc_in.cuda())
            targets = Variable(dec_out.cuda())

            outputs = model(inputs)

            # calculate loss
            optimizer.zero_grad()
            step_loss = criterion(outputs, targets)
            step_loss.backward()
            if opt.max_norm:
                nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
                # nn.utils.clip_grad_norm_(refine_2d_model.parameters(), max_norm=1)
            optimizer.step()

            loss += float(step_loss)
            current_step += 1
            glob_step += 1
        # === end looping through training batches ===

        loss = loss / nbatches
        print("=============================\n"
              "Global step: %d\n"
              "Learning rate: %.2e\n"
              "Train loss avg: %.4f\n"
              "=============================" % (glob_step, lr_now, loss))
        # === End training for an epoch ===

        # clear useless cache
        torch.cuda.empty_cache()

        # === Testing after this epoch ===
        model.eval()
        if opt.evaluateActionWise:
            print("{0:=^12} {1:=^6}".format("Action", "mm"))  # line of 30 equal signs

            cum_err = 0
            record = ''
            for action in actions:
                print("{0:<12} ".format(action), end="")
                # Get 2d and 3d testing data for this action
                action_test_set_2d = get_action_subset(test_set_2d, action)
                action_test_set_3d = get_action_subset(test_set_3d, action)
                encoder_inputs, decoder_outputs = get_all_batches(
                    opt, action_test_set_2d, action_test_set_3d, training=False)

                total_err, joint_err, step_time = evaluate_batches(
                    opt, criterion, model,
                    data_mean_3d, data_std_3d, dim_to_use_3d, dim_to_ignore_3d,
                    data_mean_2d, data_std_2d, dim_to_use_2d, dim_to_ignore_2d,
                    current_step, encoder_inputs, decoder_outputs)
                cum_err = cum_err + total_err

                print("{0:>6.2f}".format(total_err))
                record = record + "{} : {} (mm) \n".format(action, total_err)

            avg_val = cum_err / float(len(actions))
            print("{0:<12} {1:>6.2f}".format("Average", avg_val))
            print("{0:=^19}".format(''))

            f = open("records.txt", 'a')
            f.write("epoch: {} , avg_error: {} loss : {} \n".format(current_epoch, avg_val, loss))

            if best_error > avg_val:
                print("=============================")
                print("==== save best record =====")
                print("=============================")
                best_error = avg_val

                # save ckpt
                file_path = os.path.join(opt.ckpt, 'ckpt_last.pth.tar')
                torch.save({
                    'epoch': current_epoch,
                    'lr': lr_now,
                    'step': glob_step,
                    'err': avg_val,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()
                }, file_path)

                f.write("epoch: {} , avg_error: {} \n".format(current_epoch, best_error))
                f.write(record)
                f.write("=======================================\n")
            f.close()
        else:
            n_joints = 17 if not opt.predict_14 else 14

            encoder_inputs, decoder_outputs = get_all_batches(opt, test_set_2d, test_set_3d, training=False)

            total_err, joint_err, step_time = evaluate_batches(
                opt, criterion, model,
                data_mean_3d, data_std_3d, dim_to_use_3d, dim_to_ignore_3d,
                data_mean_2d, data_std_2d, dim_to_use_2d, dim_to_ignore_2d,
                current_step, encoder_inputs, decoder_outputs, current_epoch)

            print("=============================\n"
                  "Step-time (ms): %.4f\n"
                  "Val loss avg: %.4f\n"
                  "Val error avg (mm): %.2f\n"
                  "=============================" % (1000 * step_time, loss, total_err))

            # NOTE: save_flag and the records file handle f are assumed to be defined
            # elsewhere when evaluateActionWise is False.
            for i in range(n_joints):
                # 6 spaces, right-aligned, 5 decimal places
                print("Error in joint {0:02d} (mm): {1:>5.2f}".format(i + 1, joint_err[i]))
                if save_flag is True:
                    f.write("Error in joint {0:02d} (mm): {1:>5.2f} \n".format(i + 1, joint_err[i]))
            print("=============================")
            save_flag = False
            f.close()

        print("done in {0:.2f} ms".format(1000 * (time.time() - start_time)))
        # Reset global time and loss
        step_time, loss = 0, 0
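# main(opt) reads a number of fields off opt; a hypothetical namespace listing the ones
# used in this function (values are illustrative only -- the real defaults live in the
# project's option parser):
from argparse import Namespace

opt = Namespace(
    ckpt="checkpoint/", lr=1e-3, lr_decay=100000, lr_gamma=0.96,
    batch_size=64, predict_14=False, max_norm=True,
    resume=False, load="", action="All",
    cameras_path="data/h36m/cameras.h5", data_dir="data/h36m/",
    camera_frame=True, use_hg=False,
    epochs=200, evaluateActionWise=True,
)
main(opt)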