Example #1
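The function below is excerpted from Detectron's Caffe2 training utilities, so it relies on module-level names that are not shown here (cfg, workspace, nu, envu, lr_policy, np, TrainingStats, plus the local create_model and setup_model_for_training helpers). The imports it assumes are roughly the following; exact module paths differ between Detectron revisions:

import logging
import os

import numpy as np
from caffe2.python import workspace

from detectron.core.config import cfg
from detectron.utils import lr_policy
from detectron.utils.training_stats import TrainingStats
import detectron.utils.env as envu
import detectron.utils.net as nu
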
def train_model():
    """Model training loop."""
    # Create the model
    logger = logging.getLogger(__name__)
    model, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    # Training loop
    setup_model_for_training(model, output_dir)
    training_stats = TrainingStats(model)  # Track some key training statistics
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
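
Both examples rely on a TrainingStats helper for per-iteration timing (IterTic/IterToc, ResetIterTimer) whose implementation is not shown above. As a rough sketch of just the timing part, assuming a smoothing window similar in spirit to Detectron's (the class name and window size below are illustrative, not the actual implementation):

import time
from collections import deque

class IterTimerSketch(object):
    """Minimal sketch of the IterTic/IterToc pattern used by TrainingStats."""

    def __init__(self, window_size=20):
        self.iter_times = deque(maxlen=window_size)  # smoothing window of recent iteration times
        self._tic = None

    def IterTic(self):
        self._tic = time.time()

    def IterToc(self):
        self.iter_times.append(time.time() - self._tic)

    def ResetIterTimer(self):
        # drop the first few (unusually slow) iterations so they do not skew the average
        self.iter_times.clear()

    def smoothed_iter_time(self):
        return sum(self.iter_times) / max(len(self.iter_times), 1)

The ResetIterTimer call a few iterations after start_iter in both loops exists precisely to clear such a window once the data loader and GPU kernels have warmed up.
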
Example #2
def main():
    args = parser.parse_args()
    print(args)
    # for now, batch_size should match the number of GPUs
    assert args.batch_size == torch.cuda.device_count()

    # create model
    model = detector(arch=args.cnn_arch,
                     base_cnn_pkl_file=args.cnn_pkl,
                     mapping_file=args.cnn_mapping,
                     output_prob=False,
                     return_rois=False,
                     return_img_features=False)
    model = model.cuda()

    # freeze part of the net
    stop_grad = ['conv1', 'bn1', 'relu', 'maxpool', 'layer1']
    model_no_grad = torch.nn.Sequential(*[getattr(model.model, l) for l in stop_grad])
    for param in model_no_grad.parameters():
        param.requires_grad = False

    # define optimizer
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                                lr=args.base_lr,
                                momentum=args.momentum,
                                weight_decay=args.wd)

    # create dataset
    train_dataset = CocoDataset(ann_file=args.dset_ann,
                                img_dir=args.dset_path,
                                proposal_file=args.dset_rois,
                                mode='train',
                                sample_transform=preprocess_sample(target_sizes=[800],
                                                                   sample_proposals_for_training=True))
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=False,
                              num_workers=args.workers, collate_fn=collate_custom)

    training_stats = TrainingStats(losses=['loss_cls','loss_bbox'],
                                   metrics=['accuracy_cls'],
                                   solver_max_iters=args.max_iter)

    iter = args.start_iter

    print('starting training')

    while iter < args.max_iter:
        for i, batch in enumerate(train_loader):

            if args.batch_size == 1:
                batch = to_cuda_variable(batch, volatile=False)
            else:
                # when using multiple GPUs, convert to cuda later in data_parallel and list_to_tensor
                batch = to_variable(batch, volatile=False)

            # update lr
            lr = get_lr_at_iter(iter)
            adjust_learning_rate(optimizer, lr)

            # start measuring time
            training_stats.IterTic()

            # forward pass
            if args.batch_size == 1:
                cls_score, bbox_pred = model(batch['image'], batch['rois'])
                list_to_tensor = lambda x: x
            else:
                # run the model distributed over the GPUs and concatenate the outputs for the whole batch
                cls_score, bbox_pred = data_parallel(model, (batch['image'], batch['rois']))
                # convert gt data from lists to concatenated tensors
                list_to_tensor = lambda x: torch.cat(tuple(i.cuda() for i in x), 0)

            cls_labels = list_to_tensor(batch['labels_int32']).long()
            bbox_targets = list_to_tensor(batch['bbox_targets'])
            bbox_inside_weights = list_to_tensor(batch['bbox_inside_weights'])
            bbox_outside_weights = list_to_tensor(batch['bbox_outside_weights'])

            # compute loss
            loss_cls = cross_entropy(cls_score, cls_labels)
            loss_bbox = smooth_L1(bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights)

            # compute classification accuracy (for stats reporting)
            acc = accuracy(cls_score,cls_labels)

            # get final loss
            loss = loss_cls + loss_bbox

            # update
            optimizer.zero_grad()
            loss.backward()
            # Without gradient clipping I get infs and NaNs.
            # It seems that in Caffe the SGD solver performs gradient clipping by default:
            # https://github.com/BVLC/caffe/blob/master/src/caffe/solvers/sgd_solver.cpp
            # It also seems that Matterport's Mask R-CNN required gradient clipping as well
            # (see the README at https://github.com/matterport/Mask_RCNN).
            # The value max_norm=35 was taken from https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto
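            # note: in PyTorch 0.4 and later this helper is torch.nn.utils.clip_grad_norm_
            # (the in-place variant); clip_grad_norm is the older 0.3-era name used in this snippet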
            clip_grad_norm(filter(lambda p: p.requires_grad, model.parameters()), max_norm=35, norm_type=2) 
            optimizer.step()

            # stats
            training_stats.IterToc()
            
            training_stats.UpdateIterStats(losses_dict={'loss_cls': loss_cls.data.cpu().numpy().item(),
                                                        'loss_bbox': loss_bbox.data.cpu().numpy().item()},
                                           metrics_dict={'accuracy_cls':acc.data.cpu().numpy().item()})

            training_stats.LogIterStats(iter, lr)
            # save checkpoint
            if (iter+1)%args.checkpoint_period == 0:
                save_checkpoint({
                    'iter': iter,
                    'args': args,
                    'state_dict': model.state_dict(),
                    'optimizer' : optimizer.state_dict(),
                }, args.checkpoint_fn)

            if iter == args.start_iter + 20: # training_stats.LOG_PERIOD=20
                # Reset the iteration timer to remove outliers from the first few
                # SGD iterations
                training_stats.ResetIterTimer()

            # allow finishing in the middle of an epoch
            if iter > args.max_iter:
                break
            # advance iteration
            iter += 1
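
get_lr_at_iter and adjust_learning_rate are used above but not defined in the snippet. Assuming the usual Detectron-style schedule (linear warmup followed by stepwise decay), they might look roughly like the sketch below; the base LR, decay steps, and warmup length are placeholder values, not ones taken from the example:

import bisect

def get_lr_at_iter(it, base_lr=0.001, gamma=0.1, steps=(0, 60000, 80000),
                   warmup_iters=500, warmup_factor=1.0 / 3.0):
    # stepwise decay: multiply the base LR by gamma at every boundary in `steps`
    lr = base_lr * gamma ** (bisect.bisect_right(steps, it) - 1)
    # linear warmup over the first warmup_iters iterations
    if it < warmup_iters:
        alpha = it / float(warmup_iters)
        lr *= warmup_factor * (1.0 - alpha) + alpha
    return lr

def adjust_learning_rate(optimizer, lr):
    # push the new learning rate into every parameter group of the optimizer
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

Setting param_group['lr'] directly is the standard way to drive an external, per-iteration schedule with torch.optim.SGD, which is presumably what the adjust_learning_rate(optimizer, lr) call inside the loop does.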