Example #1
def train_model():
    """Model training loop."""
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, weights_file, output_dir)
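    # SummaryWriter (its import is not shown in this excerpt; presumably
    # tensorboardX or torch.utils.tensorboard) writes TensorBoard event files
    # to the output directory.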
    writer = SummaryWriter(log_dir=output_dir)
    training_stats = TrainingStats(model, writer)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        if model.roi_data_loader.has_stopped():
            handle_critical_error(model, 'roi_data_loader failed')
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats(cur_iter)
        training_stats.LogIterStats(cur_iter, lr)
        writer.add_scalar('learning_rate', lr, cur_iter)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            handle_critical_error(model, 'Loss is NaN')

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Save the training loss and metric history
    state_file = os.path.join(output_dir, 'training_state.json')
    training_stats.SaveTrainingStates(state_file)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
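

# The handle_critical_error() helper referenced in the training loops of this
# script is not part of this excerpt. A minimal sketch, assuming it follows
# the usual Detectron log-shutdown-raise pattern (the exact body is an
# assumption):
def handle_critical_error(model, msg):
    """Log a fatal training condition, stop data loading, and abort."""
    logging.getLogger(__name__).critical(msg)
    if hasattr(model, 'roi_data_loader'):
        model.roi_data_loader.shutdown()
    raise Exception(msg)

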
def main():
    # Initialize C2
    workspace.GlobalInit(
        ['caffe2', '--caffe2_log_level=0', '--caffe2_gpu_memory_tracking=1'])
    # Set up logging and load config options
    logger = setup_logging(__name__)
    logging.getLogger('detectron.roi_data.loader').setLevel(logging.INFO)
    args = parse_args()
    logger.info('Called with args:')
    logger.info(args)
    if args.cfg_file is not None:
        merge_cfg_from_file(args.cfg_file)
    if args.opts is not None:
        merge_cfg_from_list(args.opts)
    assert_and_infer_cfg()
    smi_output, cuda_ver, cudnn_ver = c2_utils.get_nvidia_info()
    logger.info("cuda version : {}".format(cuda_ver))
    logger.info("cudnn version: {}".format(cudnn_ver))
    logger.info("nvidia-smi output:\n{}".format(smi_output))
    logger.info('Training with config:')
    logger.info(pprint.pformat(cfg))
    # Note that while we set the numpy random seed network training will not be
    # deterministic in general. There are sources of non-determinism that cannot
    # be removed with a reasonable execution-speed tradeoff (such as certain
    # non-deterministic cudnn functions).
    np.random.seed(cfg.RNG_SEED)
    # Build the inference model passed to the custom DataLoader below (named
    # `inference_model` so it does not shadow the test_model() helper invoked
    # after training)
    logger.info("create test model ...")
    inference_model = test_engine.initialize_model_from_cfg(cfg.TEST.WEIGHTS,
                                                            gpu_id=0)
    logger.info("created test model ...")
    # NOTE: `root` (the dataset root directory) is assumed to be defined
    # elsewhere; it is not set in this excerpt.
    train_data = DataLoader(root,
                            "train_id.txt",
                            cfg,
                            inference_model,
                            is_train=True)
    # Create the training model
    # NOTE: `output_dir` is assumed to be defined earlier (e.g., derived from
    # the config); it is not set in this excerpt.
    model, weights_file, start_iter, checkpoints = create_model(
        True, cfg, output_dir)
    # Debug: list the blobs currently registered in the workspace
    print(workspace.Blobs())
    # Create placeholder input blobs for the stage-2 data and labels; they are
    # filled each iteration with workspace.FeedBlob in the training loop below
    blob_names = ['data_stage2', 'gt_label_stage2']
    for gpu_id in range(cfg.NUM_GPUS):
        with c2_utils.NamedCudaScope(gpu_id):
            for blob_name in blob_names:
                workspace.CreateBlob(core.ScopedName(blob_name))
    # Override random weight initialization with weights from a saved model
    if weights_file:
        nu.initialize_gpu_from_weights_file(model, weights_file, gpu_id=0)
    # Even if we're randomly initializing we still need to synchronize
    # parameters across GPUs
    nu.broadcast_parameters(model)
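    # Instantiate the training net in the Caffe2 workspace so it can be run
    # by name via workspace.RunNet() in the loop below.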
    workspace.CreateNet(model.net)

    logger.info('Outputs saved to: {:s}'.format(os.path.abspath(output_dir)))
    dump_proto_files(model, output_dir)

    writer = SummaryWriter(log_dir=output_dir)
    training_stats = TrainingStats(model, writer)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    logger.info("start train ...")
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        # Feed the stage-2 data and labels produced by the custom data loader
        data_stage2, gt_label = train_data.next_batch()
        # NOTE: `gpu_id` still holds the last value from the blob-creation
        # loop above, so data is fed under that GPU's namescope (GPU 0 when
        # cfg.NUM_GPUS == 1).
        with c2_utils.NamedCudaScope(gpu_id):
            workspace.FeedBlob(core.ScopedName('data_stage2'), data_stage2)
            workspace.FeedBlob(core.ScopedName('gt_label_stage2'), gt_label)

        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats(cur_iter)
        training_stats.LogIterStats(cur_iter, lr)
        writer.add_scalar('learning_rate', lr, cur_iter)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            handle_critical_error(model, 'Loss is NaN')

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Save the training loss and metric history
    state_file = os.path.join(output_dir, 'training_state.json')
    training_stats.SaveTrainingStates(state_file)
    # Test the trained model
    if not args.skip_test:
        test_model(checkpoints['final'], args.multi_gpu_testing, args.opts)
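

# The parse_args() helper, the test_model() testing helper called above, and
# the script entry point are defined elsewhere in the full script. A plausible
# sketch of parse_args() and the entry point, modeled on Detectron's
# tools/train_net.py; the exact flag names (beyond the attributes used above:
# cfg_file, opts, skip_test, multi_gpu_testing) are an assumption:
import argparse
import sys


def parse_args():
    parser = argparse.ArgumentParser(
        description='Train a network with Detectron')
    parser.add_argument('--cfg', dest='cfg_file', type=str, default=None,
                        help='Config file for training (and optionally testing)')
    parser.add_argument('--multi-gpu-testing', dest='multi_gpu_testing',
                        action='store_true',
                        help='Use cfg.NUM_GPUS GPUs for inference')
    parser.add_argument('--skip-test', dest='skip_test', action='store_true',
                        help='Do not test the final model after training')
    parser.add_argument('opts', default=None, nargs=argparse.REMAINDER,
                        help='See detectron/core/config.py for all options')
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    return parser.parse_args()


if __name__ == '__main__':
    main()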