def main(args):
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'

    # create checkpoint dir
    if not isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # create model
    model = network.__dict__[cfg.model](cfg.output_shape, cfg.num_class,
                                        pretrained=False)
    model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion1 = torch.nn.MSELoss().cuda()  # for Global loss
    criterion2 = torch.nn.MSELoss(reduction='none').cuda()  # for refine loss (per-element; was the deprecated reduce=False)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=cfg.lr,
                                 weight_decay=cfg.weight_decay)

    if args.resume:
        if isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            pretrained_dict = checkpoint['state_dict']
            model.load_state_dict(pretrained_dict)
            args.start_epoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
            logger = Logger(join(args.checkpoint, 'log.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'LR', 'Train Loss'])

    cudnn.benchmark = True
    print(' Total params: %.2fMB' %
          (sum(p.numel() for p in model.parameters()) / (1024 * 1024) * 4))

    train_loader = torch.utils.data.DataLoader(
        MscocoMulti(cfg),
        batch_size=cfg.batch_size * args.num_gpus, shuffle=True,
        num_workers=args.workers, pin_memory=True)

    for epoch in range(args.start_epoch, args.epochs):
        lr = adjust_learning_rate(optimizer, epoch, cfg.lr_dec_epoch, cfg.lr_gamma)
        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr))

        # train for one epoch
        train_loss = train(train_loader, model, [criterion1, criterion2], optimizer)
        print('train_loss: ', train_loss)

        # append logger file
        logger.append([epoch + 1, lr, train_loss])

        save_model({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, checkpoint=args.checkpoint)

    logger.close()
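# adjust_learning_rate() is called above but defined elsewhere in the project.
# A minimal sketch consistent with the call
# adjust_learning_rate(optimizer, epoch, cfg.lr_dec_epoch, cfg.lr_gamma),
# assuming cfg.lr_dec_epoch lists the epochs at which the rate is multiplied by
# cfg.lr_gamma; the repository's own implementation may differ:
def adjust_learning_rate(optimizer, epoch, dec_epochs, gamma):
    """Step-decay the learning rate at the given epochs and return the LR now in use."""
    if epoch in dec_epochs:
        for param_group in optimizer.param_groups:
            param_group['lr'] *= gamma
    return optimizer.param_groups[0]['lr']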
def train_model(model, dataloaders, optimizer, scheduler, num_epochs=1):
    since = time.time()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logger = Logger(join(opt.output, 'log.txt'))
    best_loss = 1e5  # set to some large enough value
    criterion = Criterion()
    data_dict = dict()

    for epoch in range(num_epochs):
        lr = scheduler.get_last_lr()[-1]
        print(f'Epoch: {epoch+1}/{num_epochs} LR: {lr:.3E}')
        data_dict['Epoch'] = epoch
        data_dict['LR'] = lr

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            batch_time = AverageMeter()
            data_time = AverageMeter()
            loss_meter = AverageMeter()
            end = time.time()
            bar_name = 'Training' if phase == 'train' else 'Testing '
            num_batch = len(dataloaders[phase])
            bar = Bar(bar_name, max=num_batch)

            # Iterate over data.
            for i, (inputs, targets) in enumerate(dataloaders[phase]):
                # measure data loading time
                data_time.update(time.time() - end)

                # move data to GPU
                inputs = inputs.to(device).float()
                targets = targets.to(device).float()

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss, _ = criterion.eval(outputs, targets)

                    # record loss
                    loss_meter.update(loss.item(), inputs.shape[0])

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()

                # plot progress
                bar.suffix = (f'({i+1:04d}/{num_batch:04d}) Data: {data_time.val:.6f}s '
                              f'| Batch: {batch_time.val:.3f}s | Total: {bar.elapsed_td} '
                              f'| ETA: {bar.eta_td} | Loss: {loss_meter.avg:.4f}')
                bar.next()
            bar.finish()

            data_dict[f'{phase} Loss'] = loss_meter.avg

            if phase == 'train':
                # save last model
                save_model(model, join(opt.output, 'last.pth'))
            else:
                # save best model (lowest validation loss so far)
                is_best = data_dict['val Loss'] < best_loss
                if is_best:
                    best_loss = data_dict['val Loss']
                    save_model(model, join(opt.output, 'best.pth'))

        # step the LR scheduler once per epoch, after the optimizer updates
        # (stepping it before optimizer.step() skips the first LR value in PyTorch >= 1.1)
        scheduler.step()

        # update the log
        logger.update(data_dict)
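# AverageMeter is used above but defined elsewhere in the project. A minimal
# sketch matching the calls made here (update(value, n) and the .val/.avg
# fields), in the style of the PyTorch ImageNet example:
class AverageMeter:
    """Tracks the most recent value and a running average."""

    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count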
def main():
    args = parse_args()
    update_config(cfg_hrnet, args)

    # create checkpoint dir
    if not isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # create model
    #print('networks.' + cfg_hrnet.MODEL.NAME + '.get_pose_net')
    model = eval('models.' + cfg_hrnet.MODEL.NAME + '.get_pose_net')(
        cfg_hrnet, is_train=True)
    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()

    # show net
    args.channels = 3
    args.height = cfg.data_shape[0]
    args.width = cfg.data_shape[1]
    #net_vision(model, args)

    # define loss function (criterion) and optimizer
    criterion = torch.nn.MSELoss(reduction='mean').cuda()

    #torch.optim.Adam
    optimizer = AdaBound(model.parameters(),
                         lr=cfg.lr,
                         weight_decay=cfg.weight_decay)

    if args.resume:
        if isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            pretrained_dict = checkpoint['state_dict']
            model.load_state_dict(pretrained_dict)
            args.start_epoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            logger = Logger(join(args.checkpoint, 'log.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'LR', 'Train Loss'])

    cudnn.benchmark = True
    torch.backends.cudnn.enabled = True
    print(' Total params: %.2fMB' %
          (sum(p.numel() for p in model.parameters()) / (1024 * 1024) * 4))

    train_loader = torch.utils.data.DataLoader(
        #MscocoMulti(cfg),
        KPloader(cfg),
        batch_size=cfg.batch_size * len(args.gpus))  #, shuffle=True,
        #num_workers=args.workers, pin_memory=True)

    #for i, (img, targets, valid) in enumerate(train_loader):
    #    print(i, img, targets, valid)

    for epoch in range(args.start_epoch, args.epochs):
        lr = adjust_learning_rate(optimizer, epoch, cfg.lr_dec_epoch, cfg.lr_gamma)
        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr))

        # train for one epoch
        train_loss = train(train_loader, model, criterion, optimizer)
        print('train_loss: ', train_loss)

        # append logger file
        logger.append([epoch + 1, lr, train_loss])

        save_model(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            },
            checkpoint=args.checkpoint)

    logger.close()
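# save_model() is called above (and in the first script) with a state dict and a
# checkpoint directory, but it is defined elsewhere in the project. A minimal
# sketch consistent with that call site; the file name is an assumption, and the
# same os/torch imports used by the rest of the file are assumed. Note that the
# second script's save_model(model, path) uses a different signature and is not
# covered here:
def save_model(state, checkpoint='checkpoint', filename='checkpoint.pth.tar'):
    """Serialize the training state dict into the checkpoint directory."""
    torch.save(state, os.path.join(checkpoint, filename))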
def main(args):
    # import pdb; pdb.set_trace()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # device = torch.device("cpu")
    print(device)
    writer = SummaryWriter(cfg.tensorboard_path)

    # create checkpoint dir
    counter = 0
    if not isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # create model
    model = network.__dict__[cfg.model](cfg.output_shape, cfg.num_class,
                                        pretrained=True)
    model = torch.nn.DataParallel(model).to(device)
    # model = model.to(device)

    # define loss function (criterion) and optimizer
    criterion_bce = torch.nn.BCELoss().to(device)
    criterion_abs = torch.nn.L1Loss().to(device)
    # criterion_abs = offset_loss().to(device)
    # criterion1 = torch.nn.MSELoss().to(device)  # for Global loss
    # criterion2 = torch.nn.MSELoss(reduce=False).to(device)  # for refine loss
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=cfg.lr,
                                 weight_decay=cfg.weight_decay)

    if args.resume:
        print(args.resume)
        checkpoint_file_resume = os.path.join(args.checkpoint,
                                              args.resume + '.pth.tar')
        if isfile(checkpoint_file_resume):
            print("=> loading checkpoint '{}'".format(checkpoint_file_resume))
            checkpoint = torch.load(checkpoint_file_resume)
            pretrained_dict = checkpoint['state_dict']
            model.load_state_dict(pretrained_dict)
            args.start_epoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_file_resume, checkpoint['epoch']))
            logger = Logger(join(args.checkpoint, 'log.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(checkpoint_file_resume))
    else:
        logger = Logger(join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'LR', 'Train Loss'])

    cudnn.benchmark = True
    print(' Total params: %.2fMB' %
          (sum(p.numel() for p in model.parameters()) / (1024 * 1024) * 4))

    train_loader = torch.utils.data.DataLoader(
        MscocoMulti_double_only(cfg),
        batch_size=cfg.batch_size * args.num_gpus,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True)

    for epoch in range(args.start_epoch, args.epochs):
        lr = adjust_learning_rate(optimizer, epoch, cfg.lr_dec_epoch, cfg.lr_gamma)
        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr))

        # train for one epoch
        train_loss, counter = train(train_loader, model,
                                    [criterion_abs, criterion_bce], writer,
                                    counter, optimizer, device)
        print('train_loss: ', train_loss)

        # append logger file
        logger.append([epoch + 1, lr, train_loss])

        save_model(
            {
                'epoch': epoch + 1,
                'info': cfg.info,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            },
            checkpoint=args.checkpoint)

    writer.export_scalars_to_json("./test.json")
    writer.close()
    logger.close()
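# Logger is used throughout these scripts (set_names/append/close, plus the
# resume=/title= keyword arguments). A minimal sketch of that interface,
# assuming a plain tab-separated text log; the project's own Logger is likely
# richer (plotting, proper resume handling), and the dict-based logger.update()
# used in train_model() above is a different interface not covered here:
class Logger:
    """Append rows of named metrics to a text log file."""

    def __init__(self, fpath, title=None, resume=False):
        self.title = title
        self.file = open(fpath, 'a' if resume else 'w')

    def set_names(self, names):
        self.file.write('\t'.join(names) + '\n')
        self.file.flush()

    def append(self, values):
        self.file.write('\t'.join(str(v) for v in values) + '\n')
        self.file.flush()

    def close(self):
        self.file.close()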
def main(args): """ Main training loop for training a stacked hourglass model on MPII dataset. :param args: Command line arguments. """ global best_acc # create checkpoint dir if not isdir(args.checkpoint_dir): mkdir_p(args.checkpoint_dir) # create model print("==> creating model '{}', stacks={}, blocks={}".format( args.arch, args.stacks, args.blocks)) model = HourglassNet(num_stacks=args.stacks, num_blocks=args.blocks, num_classes=args.num_classes, batch_norm_momentum=args.batch_norm_momentum, use_layer_norm=args.use_layer_norm, width=256, height=256) joint_visibility_model = JointVisibilityNet(hourglass_stacks=args.stacks) # scale weights if args.scale_weight_factor != 1.0: model.scale_weights_(args.scale_weight_factor) # setup horovod and model for parallel execution if args.use_horovod: hvd.init() torch.cuda.set_device(hvd.local_rank()) args.lr *= hvd.size() model.cuda() else: model = model.cuda() if args.predict_joint_visibility: joint_visibility_model = joint_visibility_model.cuda() # define loss function (criterion) and optimizer criterion = torch.nn.MSELoss(size_average=True).cuda() joint_visibility_criterion = None if not args.predict_joint_visibility else torch.nn.BCEWithLogitsLoss( ) params = [{'params': model.parameters(), 'lr': args.lr}] if args.predict_joint_visibility: params.append({ 'params': joint_visibility_model.parameters(), 'lr': args.lr }) params = model.parameters() if not args.use_amsprop: optimizer = torch.optim.RMSprop(params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) else: optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.weight_decay, amsgrad=True) if args.use_horovod: optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters()) # Create a tensorboard writer writer = SummaryWriter(log_dir="%s/hourglass_mpii_%s_tb_log" % (args.tb_dir, args.exp)) # optionally resume from a checkpoint title = 'mpii-' + args.arch if args.load: if isfile(args.load): print("=> loading checkpoint '{}'".format(args.load)) checkpoint = torch.load(args.load) # remove old usage of data parallel (used to be wrapped around model) # TODO: remove this when no old models used this state_dict = {} for key in checkpoint['state_dict']: new_key = key[len("module."):] if key.startswith( "module.") else key state_dict[new_key] = checkpoint['state_dict'][key] # restore state args.start_epoch = checkpoint['epoch'] best_acc = checkpoint['best_acc'] model.load_state_dict(state_dict) if args.predict_joint_visibility: joint_visibility_model.load_state_dict( checkpoint['joint_visibility_state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.load, checkpoint['epoch'])) logger = Logger(join(args.checkpoint_dir, 'log.txt'), title=title, resume=True) else: raise Exception("=> no checkpoint found at '{}'".format(args.load)) else: logger = Logger(join(args.checkpoint_dir, 'log.txt'), title=title) logger.set_names( ['Epoch', 'LR', 'Train Loss', 'Val Loss', 'Train Acc', 'Val Acc']) cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) # Data loading code train_dataset, train_loader, val_loader = _make_torch_data_loaders(args) if args.evaluate: print('\nEvaluation only') loss, acc, predictions = validate(val_loader, model, criterion, args.num_classes, args.debug, args.flip) save_pred(predictions, checkpoint=args.checkpoint_dir) return lr = args.lr for epoch in range(args.start_epoch, args.epochs): lr = 
adjust_learning_rate(optimizer, epoch, lr, args.schedule, args.gamma) print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr)) # decay sigma if args.sigma_decay > 0: train_loader.dataset.sigma *= args.sigma_decay val_loader.dataset.sigma *= args.sigma_decay # train for one epoch train_loss, train_acc, joint_visibility_loss, joint_visibility_acc = train( train_loader, model=model, joint_visibility_model=joint_visibility_model, criterion=criterion, num_joints=args.num_classes, joint_visibility_criterion=joint_visibility_criterion, optimizer=optimizer, epoch=epoch, writer=writer, lr=lr, debug=args.debug, flip=args.flip, remove_intermediate_supervision=args. remove_intermediate_supervision, tb_freq=args.tb_log_freq, no_grad_clipping=args.no_grad_clipping, grad_clip=args.grad_clip, use_horovod=args.use_horovod, predict_joint_visibility=args.predict_joint_visibility, predict_joint_loss_coeff=args.joint_visibility_loss_coeff) # evaluate on validation set valid_loss, valid_acc_PCK, valid_acc_PCKh, valid_acc_PCKh_per_joint, valid_joint_visibility_loss, valid_joint_visibility_acc, predictions = validate( val_loader, model, joint_visibility_model, criterion, joint_visibility_criterion, args.num_classes, args.debug, args.flip, args.use_horovod, args.use_train_mode_to_eval, args.predict_joint_visibility) # append logger file, and write to tensorboard summaries writer.add_scalars('data/epoch/losses_wrt_epochs', { 'train_loss': train_loss, 'test_lost': valid_loss }, epoch) writer.add_scalar('data/epoch/train_accuracy_PCK', train_acc, epoch) writer.add_scalar('data/epoch/test_accuracy_PCK', valid_acc_PCK, epoch) writer.add_scalar('data/epoch/test_accuracy_PCKh', valid_acc_PCKh, epoch) for key in valid_acc_PCKh_per_joint: writer.add_scalar( 'per_joint_data/epoch/test_accuracy_PCKh_%s' % key, valid_acc_PCKh_per_joint[key], epoch) logger.append( [epoch + 1, lr, train_loss, valid_loss, train_acc, valid_acc_PCK]) if args.predict_joint_visibility: writer.add_scalars( 'joint_visibility/epoch/loss', { 'train': joint_visibility_loss, 'test_lost': valid_joint_visibility_loss }, epoch) writer.add_scalars( 'joint_visibility/epoch/acc', { 'train': joint_visibility_acc, 'test_lost': valid_joint_visibility_acc }, epoch) # remember best acc and save checkpoint model_specific_checkpoint_dir = "%s/hourglass_mpii_%s" % ( args.checkpoint_dir, args.exp) if not isdir(model_specific_checkpoint_dir): mkdir_p(model_specific_checkpoint_dir) is_best = valid_acc_PCK > best_acc best_acc = max(valid_acc_PCK, best_acc) mean, stddev = train_dataset.get_mean_stddev() checkpoint = { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), 'mean': mean, 'stddev': stddev, } if args.predict_joint_visibility: checkpoint[ 'joint_visibility_state_dict'] = joint_visibility_model.state_dict( ) save_checkpoint(checkpoint, predictions, is_best, checkpoint=model_specific_checkpoint_dir) logger.close()
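# save_checkpoint() is called above with (state, predictions, is_best,
# checkpoint=dir) but defined elsewhere. A minimal sketch of that interface,
# assuming the usual "write latest, copy to best" pattern; the file names and
# the way predictions are stored are assumptions, not the project's actual
# convention:
import shutil

def save_checkpoint(state, preds, is_best, checkpoint='checkpoint',
                    filename='checkpoint.pth.tar'):
    """Save the latest training state and predictions; keep a copy of the best model."""
    filepath = os.path.join(checkpoint, filename)
    torch.save(state, filepath)
    torch.save(preds, os.path.join(checkpoint, 'preds.pt'))
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'model_best.pth.tar'))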
def main():
    args = parse_args()

    # create checkpoint dir
    if not isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # create model
    model = network.__dict__[cfg.model](cfg.channel_settings,
                                        cfg.output_shape,
                                        cfg.num_class,
                                        pretrained=True)

    # show net
    args.channels = 3
    args.height = cfg.data_shape[0]
    args.width = cfg.data_shape[1]
    #net_vision(model, args)

    if 1:  # always take the resume branch
        if isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            model.load_state_dict(checkpoint['state_dict'])
            args.start_epoch = checkpoint['epoch']
            lr = checkpoint['lr']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            logger = Logger(join(args.checkpoint, 'log.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        lr = cfg.lr
        logger = Logger(join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'LR', 'Train Loss'])

    # define loss function (criterion) and optimizer
    criterion1 = torch.nn.MSELoss().cuda()  # for Global loss
    criterion2 = torch.nn.MSELoss(reduction='none').cuda()  # for refine loss (per-element; was the deprecated reduce=False)

    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()
    cudnn.benchmark = True
    torch.backends.cudnn.enabled = True
    print(' Total params: %.2fMB' %
          (sum(p.numel() for p in model.parameters()) / (1024 * 1024) * 4))

    train_loader = torch.utils.data.DataLoader(
        #MscocoMulti(cfg),
        KPloader(cfg),
        batch_size=cfg.batch_size * len(args.gpus))  #, shuffle=True,
        #num_workers=args.workers, pin_memory=True)

    #torch.optim.Adam
    optimizer = AdaBound(model.parameters(), lr=lr, weight_decay=cfg.weight_decay)

    for epoch in range(args.start_epoch, args.epochs):
        lr = adjust_learning_rate(optimizer, epoch, cfg.lr_dec_epoch, cfg.lr_gamma)
        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr))

        # train for one epoch
        train_loss = train(train_loader, model, [criterion1, criterion2], optimizer)
        print('train_loss: ', train_loss)

        # append logger file
        logger.append([epoch + 1, lr, train_loss])

        #save_model({
        #    'epoch': epoch + 1,
        #    'state_dict': model.state_dict(),
        #    'optimizer': optimizer.state_dict(),
        #}, checkpoint=args.checkpoint)
        state_dict = model.module.state_dict()
        for key in state_dict.keys():
            state_dict[key] = state_dict[key].cpu()
        torch.save(
            {
                'epoch': epoch + 1,
                'state_dict': state_dict,
                'lr': lr,
            },
            os.path.join(args.checkpoint,
                         "epoch" + str(epoch + 1) + "checkpoint.ckpt"))
        print("=> saved model to:",
              os.path.join(args.checkpoint,
                           "epoch" + str(epoch + 1) + "checkpoint.ckpt"))

    logger.close()
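# For reference, a checkpoint written by the loop above can be reloaded like
# this. A sketch only: the keys ('epoch', 'state_dict', 'lr') and the file name
# pattern "epoch<N>checkpoint.ckpt" mirror the torch.save call above, and the
# helper name is hypothetical. The weights were saved from model.module as CPU
# tensors, so they load into the unwrapped (non-DataParallel) model:
def load_epoch_checkpoint(model, checkpoint_dir, epoch):
    """Restore weights saved by the training loop above and return (epoch, lr)."""
    path = os.path.join(checkpoint_dir, "epoch" + str(epoch) + "checkpoint.ckpt")
    ckpt = torch.load(path, map_location='cpu')
    model.load_state_dict(ckpt['state_dict'])
    return ckpt['epoch'], ckpt['lr']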