Code example #1
def train_model():
    """Model training loop."""
    # Create the model
    logger = logging.getLogger(__name__)
    model, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    # Main training loop
    setup_model_for_training(model, output_dir)
    training_stats = TrainingStats(model)  # Track some key training statistics
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
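
For reference, the learning-rate value that example #1 feeds into model.UpdateWorkspaceLr comes from lr_policy.get_lr_at_iter. A minimal sketch of a step-decay schedule with that shape, using hypothetical base_lr, gamma, and steps arguments instead of Detectron's actual cfg.SOLVER handling (which also covers warm-up and other policies):

from bisect import bisect_right


def get_lr_at_iter(cur_iter, base_lr=0.02, gamma=0.1, steps=(0, 60000, 80000)):
    """Illustrative step-decay schedule: scale base_lr by gamma at each boundary."""
    # The number of step boundaries already passed determines the decay exponent.
    return base_lr * gamma ** (bisect_right(steps, cur_iter) - 1)
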
Code example #2
File: train_net.py  Project: Alphonses/Detectron
def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter)
            )
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
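
Examples #1 and #2 are the training loop from Detectron's tools/train_net.py; the entry point is expected to merge a YAML experiment config into the global cfg and initialize the Caffe2 workspace before train_model() is called. A minimal driver sketch under those assumptions (the import path and config file shown here follow the packaged Detectron layout and are illustrative; train_model() is the function listed above):

from caffe2.python import workspace

from detectron.core.config import assert_and_infer_cfg
from detectron.core.config import merge_cfg_from_file


def main(cfg_file):
    # Initialize the Caffe2 workspace before any nets are created.
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
    # Load the experiment configuration into the global cfg and freeze it.
    merge_cfg_from_file(cfg_file)
    assert_and_infer_cfg()
    # Run the training loop shown above; it returns a dict of checkpoint paths.
    return train_model()


if __name__ == '__main__':
    main('configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml')
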
Code example #3
def net_trainer():
    model, start_iter, checkpoints = create_model()
    if 'final' in checkpoints:
        return checkpoints

    add_model_inputs(model)

    if cfg.TRAIN.WEIGHTS:
        nu.initialize_gpu_0_from_weights_file(model, cfg.TRAIN.WEIGHTS)
    # Even if we're randomly initializing we still need to synchronize
    # parameters across GPUs
    nu.broadcast_parameters(model)
    workspace.CreateNet(model.net)

    output_dir = get_output_dir(training=True)
    logger.info('Outputs saved to: {:s}'.format(os.path.abspath(output_dir)))
    dump_proto_files(model, output_dir)
    json_out_file = os.path.join(output_dir, 'json_stats.log')

    # Start loading mini-batches and enqueuing blobs
    model.roi_data_loader.register_sigint_handler()
    # DEBUG data loading
    if cfg.DEBUG.DATA_LOADING:
        for _ in range(10000000):
            # this was with threading...
            # model.roi_data_loader._get_next_minibatch()
            model.roi_data_loader._get_next_minibatch2(
                model.roi_data_loader.shared_readonly_dict,
                model.roi_data_loader._lock,
                model.roi_data_loader.mp_cur,
                model.roi_data_loader.mp_perm)
        sys.exit(0)
    model.roi_data_loader.start(prefill=True)

    # Track smoothed (median over a sliding window) statistics for each loss
    # and metric, plus total loss, iteration time, and minibatch queue size.
    smoothed_values = {
        key: SmoothedValue(WIN_SZ) for key in model.losses + model.metrics}
    iter_values = {key: 0 for key in model.losses + model.metrics}
    total_loss = SmoothedValue(WIN_SZ)
    iter_time = SmoothedValue(WIN_SZ)
    mb_qsize = SmoothedValue(WIN_SZ)
    iter_timer = Timer()
    checkpoints = {}
    for i in range(start_iter, cfg.SOLVER.MAX_ITER):
        iter_timer.tic()
        lr = model.UpdateWorkspaceLr(i)
        workspace.RunNet(model.net.Proto().name)
        if i == start_iter:
            nu.print_net(model)
        iter_time.AddValue(iter_timer.toc(average=False))
        for k in iter_values.keys():
            if k in model.losses:
                iter_values[k] = nu.sum_multi_gpu_blob(k)
            else:
                iter_values[k] = nu.average_multi_gpu_blob(k)
        for k, v in smoothed_values.items():
            v.AddValue(iter_values[k])
        loss = np.sum(np.array([iter_values[k] for k in model.losses]))
        total_loss.AddValue(loss)
        mb_qsize.AddValue(model.roi_data_loader._minibatch_queue.qsize())

        if i % LOG_PERIOD == 0 or i == cfg.SOLVER.MAX_ITER - 1:
            eta_seconds = iter_timer.average_time * (cfg.SOLVER.MAX_ITER - i)
            eta = str(datetime.timedelta(seconds=int(eta_seconds)))
            mem_stats = c2_utils.GetGPUMemoryUsageStats()
            mem_usage = np.max(mem_stats['max_by_gpu'][:cfg.NUM_GPUS])
            stats = dict(
                iter=i,
                lr=float(lr),
                time=iter_timer.average_time,
                loss=total_loss.GetMedianValue(),
                eta=eta,
                mb_qsize=int(np.round(mb_qsize.GetMedianValue())),
                mem=int(np.ceil(mem_usage / 1024 / 1024)))
            for k, v in smoothed_values.items():
                stats[k] = v.GetMedianValue()
            log_json_stats(stats, json_out_file=json_out_file)
        if cfg.DEBUG.STOP_TRAIN_ITER:
            import pdb
            pdb.set_trace()

        if ((i + 1) % int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) == 0 and
                i > start_iter):
            checkpoints[i] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(i))
            nu.save_model_to_weights_file(checkpoints[i], model)

        if i == start_iter + LOG_PERIOD:
            # Reset the iter timer after the first LOG_PERIOD iterations to
            # discard initial iterations that have outlier timings
            iter_timer.reset()

        if np.isnan(loss):
            logger.critical('Loss is NaN, exiting...')
            os._exit(0)  # FB: use code 0 to avoid flow retries

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
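
Example #3 tracks its statistics with SmoothedValue, which keeps a sliding window of recent values and reports their median through GetMedianValue(). A minimal sketch of a helper with that interface, not the project's exact implementation:

from collections import deque

import numpy as np


class SmoothedValue(object):
    """Track a window of recent scalar values and report their median."""

    def __init__(self, window_size):
        # Oldest values drop out automatically once the window is full.
        self.values = deque(maxlen=window_size)

    def AddValue(self, value):
        self.values.append(value)

    def GetMedianValue(self):
        # Median over the current window; robust to occasional outliers.
        return float(np.median(self.values))
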