def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    # Create the model
    model, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    # Iterative training
    setup_model_for_training(model, output_dir)
    training_stats = TrainingStats(model)  # Track some key training statistics
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            # Periodically snapshot the model weights
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
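
# The loop above (and the PyTorch loop below) pulls its learning rate from a
# get_lr_at_iter() helper that is not shown here. Purely as a hedged
# illustration of the kind of schedule such a helper typically implements
# (stepped decay with linear warmup), the sketch below is provided; the name
# get_lr_at_iter_sketch and all constants (base_lr, gamma, steps, warmup_iters,
# warmup_factor) are assumptions, not the actual lr policy used in this repo.
def get_lr_at_iter_sketch(it,
                          base_lr=0.01,
                          gamma=0.1,
                          steps=(0, 120000, 160000),
                          warmup_iters=500,
                          warmup_factor=1.0 / 3.0):
    """Stepped LR schedule with linear warmup (illustrative sketch only)."""
    # Count how many decay steps have already been passed at iteration `it`
    num_decays = sum(1 for s in steps[1:] if it >= s)
    lr = base_lr * (gamma ** num_decays)
    # Linearly ramp from warmup_factor * lr up to lr over the first iterations
    if it < warmup_iters:
        alpha = it / float(warmup_iters)
        lr *= warmup_factor * (1.0 - alpha) + alpha
    return lr
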
def main():
    args = parser.parse_args()
    print(args)

    # for now, batch_size should match number of gpus
    assert args.batch_size == torch.cuda.device_count()

    # create model
    model = detector(arch=args.cnn_arch,
                     base_cnn_pkl_file=args.cnn_pkl,
                     mapping_file=args.cnn_mapping,
                     output_prob=False,
                     return_rois=False,
                     return_img_features=False)
    model = model.cuda()

    # freeze part of the net
    stop_grad = ['conv1', 'bn1', 'relu', 'maxpool', 'layer1']
    model_no_grad = torch.nn.Sequential(*[getattr(model.model, l) for l in stop_grad])
    for param in model_no_grad.parameters():
        param.requires_grad = False

    # define optimizer
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                                lr=args.base_lr,
                                momentum=args.momentum,
                                weight_decay=args.wd)

    # create dataset
    train_dataset = CocoDataset(ann_file=args.dset_ann,
                                img_dir=args.dset_path,
                                proposal_file=args.dset_rois,
                                mode='train',
                                sample_transform=preprocess_sample(
                                    target_sizes=[800],
                                    sample_proposals_for_training=True))
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.workers,
                              collate_fn=collate_custom)

    training_stats = TrainingStats(losses=['loss_cls', 'loss_bbox'],
                                   metrics=['accuracy_cls'],
                                   solver_max_iters=args.max_iter)

    iter = args.start_iter
    print('starting training')
    while iter < args.max_iter:
        for i, batch in enumerate(train_loader):
            if args.batch_size == 1:
                batch = to_cuda_variable(batch, volatile=False)
            else:
                # when using multiple GPUs convert to cuda later in data_parallel and list_to_tensor
                batch = to_variable(batch, volatile=False)

            # update lr
            lr = get_lr_at_iter(iter)
            adjust_learning_rate(optimizer, lr)

            # start measuring time
            training_stats.IterTic()

            # forward pass
            if args.batch_size == 1:
                cls_score, bbox_pred = model(batch['image'], batch['rois'])
                list_to_tensor = lambda x: x
            else:
                # run model distributed over gpus and concatenate outputs for all batch
                cls_score, bbox_pred = data_parallel(model, (batch['image'], batch['rois']))
                # convert gt data from lists to concatenated tensors
                list_to_tensor = lambda x: torch.cat(tuple([i.cuda() for i in x]), 0)
            cls_labels = list_to_tensor(batch['labels_int32']).long()
            bbox_targets = list_to_tensor(batch['bbox_targets'])
            bbox_inside_weights = list_to_tensor(batch['bbox_inside_weights'])
            bbox_outside_weights = list_to_tensor(batch['bbox_outside_weights'])

            # compute loss
            loss_cls = cross_entropy(cls_score, cls_labels)
            loss_bbox = smooth_L1(bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights)
            # compute classification accuracy (for stats reporting)
            acc = accuracy(cls_score, cls_labels)
            # get final loss
            loss = loss_cls + loss_bbox

            # update
            optimizer.zero_grad()
            loss.backward()
            # Without gradient clipping I get infs and NaNs.
            # It seems that in Caffe the SGD solver performs grad clipping by default:
            # https://github.com/BVLC/caffe/blob/master/src/caffe/solvers/sgd_solver.cpp
            # It also seems that Matterport's Mask R-CNN required grad clipping as well
            # (see the README in https://github.com/matterport/Mask_RCNN).
            # The value max_norm=35 was taken from
            # https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto
            clip_grad_norm(filter(lambda p: p.requires_grad, model.parameters()),
                           max_norm=35, norm_type=2)
            optimizer.step()

            # stats
            training_stats.IterToc()
            training_stats.UpdateIterStats(
                losses_dict={'loss_cls': loss_cls.data.cpu().numpy().item(),
                             'loss_bbox': loss_bbox.data.cpu().numpy().item()},
                metrics_dict={'accuracy_cls': acc.data.cpu().numpy().item()})
            training_stats.LogIterStats(iter, lr)

            # save checkpoint
            if (iter + 1) % args.checkpoint_period == 0:
                save_checkpoint({
                    'iter': iter,
                    'args': args,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, args.checkpoint_fn)

            if iter == args.start_iter + 20:  # training_stats.LOG_PERIOD = 20
                # Reset the iteration timer to remove outliers from the first few
                # SGD iterations
                training_stats.ResetIterTimer()

            # allow finishing in the middle of an epoch
            if iter > args.max_iter:
                break

            # advance iteration
            iter += 1
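
# The loss and metric helpers used above (smooth_L1, accuracy) come from the
# surrounding repo and are not shown here. Purely as a hedged illustration of
# what such helpers typically compute in Fast R-CNN-style training (not
# necessarily the exact implementation used above), here is a minimal PyTorch
# sketch; the names smooth_l1_sketch/accuracy_sketch, the beta value, and the
# mean-over-ROIs normalization are assumptions.
import torch


def smooth_l1_sketch(bbox_pred, bbox_targets, inside_weights, outside_weights, beta=1.0):
    """Smooth L1 bbox regression loss with Fast R-CNN-style inside/outside weights."""
    diff = inside_weights * (bbox_pred - bbox_targets)
    abs_diff = diff.abs()
    # Quadratic below beta, linear above it
    quad_mask = (abs_diff < beta).float()
    per_elem = quad_mask * 0.5 * diff ** 2 / beta + (1.0 - quad_mask) * (abs_diff - 0.5 * beta)
    # Outside weights zero out entries that should not contribute;
    # averaging over the number of ROIs is an assumed normalization.
    return (outside_weights * per_elem).sum() / bbox_pred.size(0)


def accuracy_sketch(cls_score, cls_labels):
    """Fraction of ROIs whose argmax class matches the ground-truth label."""
    preds = cls_score.max(1)[1]
    return (preds == cls_labels).float().mean()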