def test_restore_checkpoint():
    # Create Model
    model = model_builder.create(cfg.MODEL.TYPE, train=True)
    add_momentum_init_ops(model)
    init_weights(model)
    # Fill input blobs
    roidb = combined_roidb_for_training(
        cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES
    )
    model_builder.add_training_inputs(model, roidb=roidb)
    workspace.CreateNet(model.net)
    # Bookkeeping for checkpoint creation
    iter_num = 0
    checkpoints = {}
    output_dir = get_output_dir(cfg.TRAIN.DATASETS, training=True)
    chk_file_path = os.path.join(output_dir, 'model_iter{}.pkl'.format(iter_num))
    checkpoints[iter_num] = chk_file_path
    # Save model weights
    nu.save_model_to_weights_file(checkpoints[iter_num], model)
    orig_gpu_0_params, orig_all_params = get_params(model)
    # Change the model weights
    init_weights(model)
    # Reload the weights in the model
    nu.initialize_gpu_from_weights_file(model, chk_file_path, gpu_id=0)
    nu.broadcast_parameters(model)
    shutil.rmtree(cfg.OUTPUT_DIR)
    _, restored_all_params = get_params(model)
    # Check if all params are loaded correctly
    for scoped_name, blob in orig_all_params.items():
        np.testing.assert_array_equal(blob, restored_all_params[scoped_name])
    # Check if broadcast_parameters works
    for scoped_name, blob in restored_all_params.items():
        unscoped_name = c2_utils.UnscopeName(scoped_name)
        np.testing.assert_array_equal(blob, orig_gpu_0_params[unscoped_name])
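
The test above relies on a get_params helper that is not shown in this listing. Below is a minimal sketch of what it might look like, assuming model.params holds GPU-scoped parameter blobs and c2_utils.UnscopeName strips the gpu_N/ prefix; the names and structure are assumptions for illustration, not the actual helper from the source.

def get_params(model):
    # Hypothetical helper: fetch every parameter blob from the workspace.
    # Returns (gpu_0 params keyed by unscoped name, all params keyed by scoped name).
    gpu_0_params = {}
    all_params = {}
    for param in model.params:
        scoped_name = str(param)
        blob = workspace.FetchBlob(scoped_name)
        all_params[scoped_name] = blob
        if scoped_name.startswith('gpu_0/'):
            gpu_0_params[c2_utils.UnscopeName(scoped_name)] = blob
    return gpu_0_params, all_params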
Example #2
def setup_model_for_training(model, output_dir):
    """Loaded saved weights and create the network in the C2 workspace."""
    logger = logging.getLogger(__name__)
    add_model_training_inputs(model)

    if cfg.TRAIN.WEIGHTS:
        # Override random weight initialization with weights from a saved model
        nu.initialize_gpu_0_from_weights_file(model, cfg.TRAIN.WEIGHTS)
    # Even if we're randomly initializing we still need to synchronize
    # parameters across GPUs
    nu.broadcast_parameters(model)
    workspace.CreateNet(model.net)

    logger.info('Outputs saved to: {:s}'.format(os.path.abspath(output_dir)))
    dump_proto_files(model, output_dir)

    # Start loading mini-batches and enqueuing blobs
    model.roi_data_loader.register_sigint_handler()
    model.roi_data_loader.start(prefill=True)
    return output_dir
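
Both setup_model_for_training variants call a dump_proto_files helper that is not included here. A minimal sketch, assuming it only writes the text form of the training net and the parameter init net into output_dir for later inspection:

def dump_proto_files(model, output_dir):
    # Assumed behavior: serialize the human-readable protobufs so a run
    # can be inspected or diffed after the fact.
    with open(os.path.join(output_dir, 'net.pbtxt'), 'w') as fid:
        fid.write(str(model.net.Proto()))
    with open(os.path.join(output_dir, 'param_init_net.pbtxt'), 'w') as fid:
        fid.write(str(model.param_init_net.Proto()))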
Example #3
def setup_model_for_training(model, output_dir):
    """Loaded saved weights and create the network in the C2 workspace."""
    logger = logging.getLogger(__name__)
    add_model_training_inputs(model)

    if cfg.TRAIN.WEIGHTS:
        # Override random weight initialization with weights from a saved model
        nu.initialize_gpu_from_weights_file(model, cfg.TRAIN.WEIGHTS, gpu_id=0)
    # Even if we're randomly initializing we still need to synchronize
    # parameters across GPUs
    nu.broadcast_parameters(model)
    workspace.CreateNet(model.net)

    logger.info('Outputs saved to: {:s}'.format(os.path.abspath(output_dir)))
    dump_proto_files(model, output_dir)

    # Start loading mini-batches and enqueuing blobs
    model.roi_data_loader.register_sigint_handler()
    model.roi_data_loader.start(prefill=True)
    return output_dir
def init_weights(model):
    # Initialize weights on gpu_id = 0 and then broadcast to the other GPUs
    workspace.RunNetOnce(model.param_init_net)
    nu.broadcast_parameters(model)
Example #5
def net_trainer():
    model, start_iter, checkpoints = create_model()
    if 'final' in checkpoints:
        return checkpoints

    add_model_inputs(model)

    if cfg.TRAIN.WEIGHTS:
        nu.initialize_gpu_0_from_weights_file(model, cfg.TRAIN.WEIGHTS)
    # Even if we're randomly initializing we still need to synchronize
    # parameters across GPUs
    nu.broadcast_parameters(model)
    workspace.CreateNet(model.net)

    output_dir = get_output_dir(training=True)
    logger.info('Outputs saved to: {:s}'.format(os.path.abspath(output_dir)))
    dump_proto_files(model, output_dir)
    json_out_file = os.path.join(output_dir, 'json_stats.log')

    # Start loading mini-batches and enqueuing blobs
    model.roi_data_loader.register_sigint_handler()
    # DEBUG data loading
    if cfg.DEBUG.DATA_LOADING:
        for _ in range(10000000):
            # this was with threading...
            # model.roi_data_loader._get_next_minibatch()
            model.roi_data_loader._get_next_minibatch2(
                model.roi_data_loader.shared_readonly_dict,
                model.roi_data_loader._lock,
                model.roi_data_loader.mp_cur,
                model.roi_data_loader.mp_perm)
        sys.exit(0)
    model.roi_data_loader.start(prefill=True)

    smoothed_values = {
        key: SmoothedValue(WIN_SZ) for key in model.losses + model.metrics}
    iter_values = {key: 0 for key in model.losses + model.metrics}
    total_loss = SmoothedValue(WIN_SZ)
    iter_time = SmoothedValue(WIN_SZ)
    mb_qsize = SmoothedValue(WIN_SZ)
    iter_timer = Timer()
    checkpoints = {}
    for i in range(start_iter, cfg.SOLVER.MAX_ITER):
        iter_timer.tic()
        lr = model.UpdateWorkspaceLr(i)
        workspace.RunNet(model.net.Proto().name)
        if i == start_iter:
            nu.print_net(model)
        iter_time.AddValue(iter_timer.toc(average=False))
        for k in iter_values.keys():
            if k in model.losses:
                iter_values[k] = nu.sum_multi_gpu_blob(k)
            else:
                iter_values[k] = nu.average_multi_gpu_blob(k)
        for k, v in smoothed_values.items():
            v.AddValue(iter_values[k])
        loss = np.sum(np.array([iter_values[k] for k in model.losses]))
        total_loss.AddValue(loss)
        mb_qsize.AddValue(model.roi_data_loader._minibatch_queue.qsize())

        if i % LOG_PERIOD == 0 or i == cfg.SOLVER.MAX_ITER - 1:
            eta_seconds = iter_timer.average_time * (cfg.SOLVER.MAX_ITER - i)
            eta = str(datetime.timedelta(seconds=int(eta_seconds)))
            mem_stats = c2_utils.GetGPUMemoryUsageStats()
            mem_usage = np.max(mem_stats['max_by_gpu'][:cfg.NUM_GPUS])
            stats = dict(
                iter=i,
                lr=float(lr),
                time=iter_timer.average_time,
                loss=total_loss.GetMedianValue(),
                eta=eta,
                mb_qsize=int(np.round(mb_qsize.GetMedianValue())),
                mem=int(np.ceil(mem_usage / 1024 / 1024)))
            for k, v in smoothed_values.items():
                stats[k] = v.GetMedianValue()
            log_json_stats(stats, json_out_file=json_out_file)
        if cfg.DEBUG.STOP_TRAIN_ITER:
            import pdb
            pdb.set_trace()

        if ((i + 1) % int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) == 0 and
                i > start_iter):
            checkpoints[i] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(i))
            nu.save_model_to_weights_file(checkpoints[i], model)

        if i == start_iter + LOG_PERIOD:
            # Reset the iter timer after the first LOG_PERIOD iterations to
            # discard initial iterations that have outlier timings
            iter_timer.reset()

        if np.isnan(loss):
            logger.critical('Loss is NaN, exiting...')
            os._exit(0)  # FB: use code 0 to avoid flow retries

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
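
net_trainer depends on a SmoothedValue class (AddValue/GetMedianValue) and a Timer (tic/toc/reset/average_time) that are not part of this listing. The sketch below is a minimal, assumed version of both, written only to make the training loop above readable; it is not the actual utility code from the source.

import time
from collections import deque

import numpy as np


class SmoothedValue(object):
    # Assumed minimal version: keep the last window_size values and report
    # their median, which is robust to occasional loss spikes.
    def __init__(self, window_size):
        self.deque = deque(maxlen=window_size)

    def AddValue(self, value):
        self.deque.append(value)

    def GetMedianValue(self):
        return np.median(list(self.deque))


class Timer(object):
    # Assumed minimal version: tic()/toc() pairs, a running average used for
    # the ETA estimate, and reset() to discard warm-up iterations.
    def __init__(self):
        self.reset()

    def tic(self):
        self.start_time = time.time()

    def toc(self, average=True):
        self.diff = time.time() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        return self.average_time if average else self.diff

    def reset(self):
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.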