def UpdateWorkspaceLr(self, cur_iter): """Updates the model's current learning rate and the workspace (learning rate and update history/momentum blobs). """ new_lr = lr_policy.get_lr_at_iter(cur_iter) if new_lr != self.current_lr: ratio = _get_lr_change_ratio(self.current_lr, new_lr) if ratio > 1.1: logger.info( 'Setting learning rate to {:.6f} at iteration {}'.format( new_lr, cur_iter)) self._SetNewLr(self.current_lr, new_lr)
def UpdateWorkspaceLr(self, cur_iter): """Updates the model's current learning rate and the workspace (learning rate and update history/momentum blobs). """ new_lr = lr_policy.get_lr_at_iter(cur_iter) if new_lr != self.current_lr: # avoid too noisy logging if new_lr / self.current_lr < 0.9 or new_lr / self.current_lr > 1.1: logger.info( 'Setting learning rate to {:.6f} at iteration {}'.format( new_lr, cur_iter)) self._SetNewLr(new_lr)
def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    # Create the model
    model, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints
    # Iterative training
    setup_model_for_training(model, output_dir)
    # Track some key training statistics
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter)
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)
        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)
        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()
        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()
    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shut down data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
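# TrainingStats is used in the loop above but not shown. A minimal sketch of
# the interface the loop relies on (IterTic / IterToc / ResetIterTimer,
# LOG_PERIOD, iter_total_loss, plus UpdateIterStats / LogIterStats stubs);
# everything beyond the names used above, the deque-based timing window in
# particular, is an assumption rather than the verbatim implementation.
import time
from collections import deque

class TrainingStats(object):
    """Track timing and loss statistics for the training loop (sketch)."""
    LOG_PERIOD = 20  # assumption: iterations between log lines

    def __init__(self, model):
        self.model = model
        self.iter_total_loss = 0.0  # refreshed by UpdateIterStats each iter
        self._tic = 0.0
        self._iter_times = deque(maxlen=self.LOG_PERIOD)

    def IterTic(self):
        self._tic = time.time()

    def IterToc(self):
        self._iter_times.append(time.time() - self._tic)

    def ResetIterTimer(self):
        # Forget the first few (outlier) iteration timings
        self._iter_times.clear()

    def UpdateIterStats(self):
        # Assumption: fetch the model's loss blobs from the workspace and
        # fold them into iter_total_loss and per-loss smoothed averages
        pass

    def LogIterStats(self, cur_iter, lr):
        # Assumption: periodically emit iter, lr, loss, and timing stats
        pass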
def UpdateWorkspaceLr(self, cur_iter): """Updates the model's current learning rate and the workspace (learning rate and update history/momentum blobs). """ # The workspace is the one source of truth for the lr # The lr is always the same on all GPUs cur_lr = workspace.FetchBlob('gpu_0/lr')[0] new_lr = lr_policy.get_lr_at_iter(cur_iter) # There are no type conversions between the lr in Python and the lr in # the GPU (both are float32), so exact comparision is ok if cur_lr != new_lr: ratio = _get_lr_change_ratio(cur_lr, new_lr) if ratio > cfg.SOLVER.LOG_LR_CHANGE_THRESHOLD: logger.info( 'Changing learning rate {:.6f} -> {:.6f} at iter {:d}'. format(cur_lr, new_lr, cur_iter)) self._SetNewLr(cur_lr, new_lr) return new_lr
def UpdateWorkspaceLr(self, cur_iter): """Updates the model's current learning rate and the workspace (learning rate and update history/momentum blobs). """ # The workspace is the one source of truth for the lr # The lr is always the same on all GPUs cur_lr = workspace.FetchBlob('gpu_0/lr')[0] new_lr = lr_policy.get_lr_at_iter(cur_iter) # There are no type conversions between the lr in Python and the lr in # the GPU (both are float32), so exact comparision is ok if cur_lr != new_lr: ratio = np.max((new_lr / cur_lr, cur_lr / new_lr)) if ratio > cfg.SOLVER.LOG_LR_CHANGE_THRESHOLD: logger.info( 'Changing learning rate {:.6f} -> {:.6f} at iter {:d}'. format(cur_lr, new_lr, cur_iter)) self._SetNewLr(cur_lr, new_lr) return new_lr
def SetCurrentLr(self, cur_iter):
    """Set the model's current learning rate without changing any blobs in
    the workspace.
    """
    self.current_lr = lr_policy.get_lr_at_iter(cur_iter)
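# lr_policy.get_lr_at_iter is used throughout but not shown. A minimal
# step-decay sketch, assuming cfg.SOLVER.BASE_LR, cfg.SOLVER.GAMMA, and
# cfg.SOLVER.STEPS config keys; the real policy module may implement warmup
# and other schedules on top of this.
def get_lr_at_iter(it):
    """Map an iteration index to a learning rate via stepwise decay."""
    lr = cfg.SOLVER.BASE_LR
    for step_iter in cfg.SOLVER.STEPS:
        # Apply one multiplicative decay for every step boundary passed
        if it >= step_iter:
            lr *= cfg.SOLVER.GAMMA
    return lr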