def UpdateWorkspaceLr(self, cur_iter):
     """Updates the model's current learning rate and the workspace (learning
     rate and update history/momentum blobs).
     """
     new_lr = lr_policy.get_lr_at_iter(cur_iter)
     if new_lr != self.current_lr:
         ratio = _get_lr_change_ratio(self.current_lr, new_lr)
         if ratio > 1.1:
             logger.info(
                 'Setting learning rate to {:.6f} at iteration {}'.format(
                     new_lr, cur_iter))
         self._SetNewLr(self.current_lr, new_lr)
 def UpdateWorkspaceLr(self, cur_iter):
     """Updates the model's current learning rate and the workspace (learning
     rate and update history/momentum blobs).
     """
     new_lr = lr_policy.get_lr_at_iter(cur_iter)
     if new_lr != self.current_lr:
         # avoid too noisy logging
         if new_lr / self.current_lr < 0.9 or new_lr / self.current_lr > 1.1:
             logger.info(
                 'Setting learning rate to {:.6f} at iteration {}'.format(
                     new_lr, cur_iter))
         self._SetNewLr(new_lr)
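Both variants above call a _get_lr_change_ratio helper that these snippets do not include. Example #5 below inlines the same computation as np.max((new_lr / cur_lr, cur_lr / new_lr)), so a minimal standalone sketch could look like this; the eps guard against a zero learning rate is an assumption, not something the original code shows:

import numpy as np

def _get_lr_change_ratio(cur_lr, new_lr):
    # Symmetric relative change: 0.01 -> 0.001 and 0.001 -> 0.01 both give 10.
    eps = 1e-10  # assumed guard so a zero lr cannot cause a division by zero
    return np.max(
        (new_lr / np.max((cur_lr, eps)), cur_lr / np.max((new_lr, eps)))
    )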
Example #3
def train_model():
    """Model training loop."""
    # Create the model
    logger = logging.getLogger(__name__)
    model, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    # Iterative training
    setup_model_for_training(model, output_dir)
    training_stats = TrainingStats(model)  # tracks some key training statistics
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
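train_model returns the checkpoints dict, keyed by iteration number plus the 'final' entry, so a caller can pick up the saved weights afterwards. A hypothetical driver (test_model is illustrative and not defined in these snippets) might look like:

checkpoints = train_model()
# 'final' always points at the weights written after the last iteration.
final_weights = checkpoints['final']
test_model(final_weights)  # hypothetical evaluation entry point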
Example #4
 def UpdateWorkspaceLr(self, cur_iter):
     """Updates the model's current learning rate and the workspace (learning
     rate and update history/momentum blobs).
     """
     # The workspace is the one source of truth for the lr
     # The lr is always the same on all GPUs
     cur_lr = workspace.FetchBlob('gpu_0/lr')[0]
     new_lr = lr_policy.get_lr_at_iter(cur_iter)
     # There are no type conversions between the lr in Python and the lr in
     # the GPU (both are float32), so exact comparison is ok
     if cur_lr != new_lr:
         ratio = _get_lr_change_ratio(cur_lr, new_lr)
         if ratio > cfg.SOLVER.LOG_LR_CHANGE_THRESHOLD:
             logger.info(
                 'Changing learning rate {:.6f} -> {:.6f} at iter {:d}'.
                 format(cur_lr, new_lr, cur_iter))
         self._SetNewLr(cur_lr, new_lr)
     return new_lr
Example #5
 def UpdateWorkspaceLr(self, cur_iter):
     """Updates the model's current learning rate and the workspace (learning
     rate and update history/momentum blobs).
     """
     # The workspace is the one source of truth for the lr
     # The lr is always the same on all GPUs
     cur_lr = workspace.FetchBlob('gpu_0/lr')[0]
     new_lr = lr_policy.get_lr_at_iter(cur_iter)
     # There are no type conversions between the lr in Python and the lr in
     # the GPU (both are float32), so exact comparison is ok
     if cur_lr != new_lr:
         ratio = np.max((new_lr / cur_lr, cur_lr / new_lr))
         if ratio > cfg.SOLVER.LOG_LR_CHANGE_THRESHOLD:
             logger.info(
                 'Changing learning rate {:.6f} -> {:.6f} at iter {:d}'.
                 format(cur_lr, new_lr, cur_iter))
         self._SetNewLr(cur_lr, new_lr)
     return new_lr
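The _SetNewLr(cur_lr, new_lr) helper used in Examples #4 and #5 is also not shown. Per the docstring it has to update both the workspace learning-rate blob and the update history/momentum blobs; a sketch along those lines follows. The per-GPU blob naming, the _CorrectMomentum call, and the cfg.SOLVER.SCALE_MOMENTUM* fields are assumptions about the surrounding code, not something these snippets confirm:

from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
import numpy as np

def _SetNewLr(self, cur_lr, new_lr):
    """Feed the new learning rate into the per-GPU lr blobs and, when the
    change is large, rescale the momentum/update-history blobs to match.
    """
    for i in range(cfg.NUM_GPUS):
        # Feed under the matching GPU device so the blob stays on that device.
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, i)):
            workspace.FeedBlob(
                'gpu_{}/lr'.format(i), np.array([new_lr], dtype=np.float32))
    ratio = _get_lr_change_ratio(cur_lr, new_lr)
    # SCALE_MOMENTUM / SCALE_MOMENTUM_THRESHOLD are assumed config fields;
    # _CorrectMomentum is assumed to scale every momentum blob by the factor.
    if cfg.SOLVER.SCALE_MOMENTUM and ratio > cfg.SOLVER.SCALE_MOMENTUM_THRESHOLD:
        self._CorrectMomentum(new_lr / cur_lr)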
Example #6
def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter)
            )
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
 def SetCurrentLr(self, cur_iter):
     """Set the model's current learning rate without changing any blobs in
     the workspace.
     """
     self.current_lr = lr_policy.get_lr_at_iter(cur_iter)
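SetCurrentLr only records the scheduled rate on the Python side, whereas UpdateWorkspaceLr also pushes it into the workspace blobs. A hypothetical use is syncing the model's bookkeeping at the resume point before the training loop starts (the call sequence below is illustrative only):

model, start_iter, checkpoints, output_dir = create_model()
# Align the Python-side lr with the schedule at start_iter; the workspace
# blobs are left untouched and will be refreshed by UpdateWorkspaceLr on
# the first training iteration.
model.SetCurrentLr(start_iter)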