def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    # Create the model
    model, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints
    # Set the model up for iterative training
    setup_model_for_training(model, output_dir)
    training_stats = TrainingStats(model)  # track some key training statistics
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
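As a quick illustration of the snapshot cadence in train_model, the standalone sketch below lists which iterations would write an intermediate checkpoint under the `(cur_iter + 1) % CHECKPOINT_PERIOD` rule. The config values are assumed examples, not read from a real cfg.

# Illustrative sketch only; SNAPSHOT_ITERS, NUM_GPUS, and MAX_ITER below are
# assumed example values standing in for cfg.TRAIN.SNAPSHOT_ITERS,
# cfg.NUM_GPUS, and cfg.SOLVER.MAX_ITER.
SNAPSHOT_ITERS = 20000
NUM_GPUS = 8
MAX_ITER = 90000
start_iter = 0

CHECKPOINT_PERIOD = int(SNAPSHOT_ITERS / NUM_GPUS)  # 2500 iterations per snapshot
checkpoint_iters = [
    cur_iter for cur_iter in range(start_iter, MAX_ITER)
    if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter
]
print(checkpoint_iters[:3], '...', checkpoint_iters[-1])
# [2499, 4999, 7499] ... 89999 -- plus the 'final' checkpoint written after the loop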
def net_trainer():
    model, start_iter, checkpoints = create_model()
    if 'final' in checkpoints:
        return checkpoints

    add_model_inputs(model)

    if cfg.TRAIN.WEIGHTS:
        nu.initialize_gpu_0_from_weights_file(model, cfg.TRAIN.WEIGHTS)
    # Even if we're randomly initializing we still need to synchronize
    # parameters across GPUs
    nu.broadcast_parameters(model)
    workspace.CreateNet(model.net)

    output_dir = get_output_dir(training=True)
    logger.info('Outputs saved to: {:s}'.format(os.path.abspath(output_dir)))
    dump_proto_files(model, output_dir)
    json_out_file = os.path.join(output_dir, 'json_stats.log')

    # Start loading mini-batches and enqueuing blobs
    model.roi_data_loader.register_sigint_handler()
    # DEBUG data loading
    if cfg.DEBUG.DATA_LOADING:
        for _ in range(10000000):
            # this was with threading...
            # model.roi_data_loader._get_next_minibatch()
            model.roi_data_loader._get_next_minibatch2(
                model.roi_data_loader.shared_readonly_dict,
                model.roi_data_loader._lock,
                model.roi_data_loader.mp_cur,
                model.roi_data_loader.mp_perm)
        sys.exit(0)
    model.roi_data_loader.start(prefill=True)

    smoothed_values = {
        key: SmoothedValue(WIN_SZ) for key in model.losses + model.metrics}
    iter_values = {key: 0 for key in model.losses + model.metrics}
    total_loss = SmoothedValue(WIN_SZ)
    iter_time = SmoothedValue(WIN_SZ)
    mb_qsize = SmoothedValue(WIN_SZ)
    iter_timer = Timer()
    checkpoints = {}
    for i in range(start_iter, cfg.SOLVER.MAX_ITER):
        iter_timer.tic()
        lr = model.UpdateWorkspaceLr(i)
        workspace.RunNet(model.net.Proto().name)
        if i == start_iter:
            nu.print_net(model)
        iter_time.AddValue(iter_timer.toc(average=False))
        for k in iter_values.keys():
            if k in model.losses:
                iter_values[k] = nu.sum_multi_gpu_blob(k)
            else:
                iter_values[k] = nu.average_multi_gpu_blob(k)
        for k, v in smoothed_values.items():
            v.AddValue(iter_values[k])
        loss = np.sum(np.array([iter_values[k] for k in model.losses]))
        total_loss.AddValue(loss)
        mb_qsize.AddValue(model.roi_data_loader._minibatch_queue.qsize())

        if i % LOG_PERIOD == 0 or i == cfg.SOLVER.MAX_ITER - 1:
            eta_seconds = iter_timer.average_time * (cfg.SOLVER.MAX_ITER - i)
            eta = str(datetime.timedelta(seconds=int(eta_seconds)))
            mem_stats = c2_utils.GetGPUMemoryUsageStats()
            mem_usage = np.max(mem_stats['max_by_gpu'][:cfg.NUM_GPUS])
            stats = dict(
                iter=i,
                lr=float(lr),
                time=iter_timer.average_time,
                loss=total_loss.GetMedianValue(),
                eta=eta,
                mb_qsize=int(np.round(mb_qsize.GetMedianValue())),
                mem=int(np.ceil(mem_usage / 1024 / 1024)))
            for k, v in smoothed_values.items():
                stats[k] = v.GetMedianValue()
            log_json_stats(stats, json_out_file=json_out_file)
        if cfg.DEBUG.STOP_TRAIN_ITER:
            import pdb
            pdb.set_trace()

        if ((i + 1) % int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) == 0 and
                i > start_iter):
            checkpoints[i] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(i))
            nu.save_model_to_weights_file(checkpoints[i], model)

        if i == start_iter + LOG_PERIOD:
            # Reset the iter timer after the first LOG_PERIOD iterations to
            # discard initial iterations that have outlier timings
            iter_timer.reset()

        if np.isnan(loss):
            logger.critical('Loss is NaN, exiting...')
            os._exit(0)  # FB: use code 0 to avoid flow retries

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
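net_trainer smooths its logged statistics with SmoothedValue windows. The class below is a minimal stand-in, assuming only the AddValue/GetMedianValue interface used above rather than Detectron's actual utility: it keeps the last window_size values and reports their median, so a single outlier iteration barely moves the logged numbers.

# Minimal sketch of a SmoothedValue-style tracker (assumed interface, not the
# real Detectron implementation).
from collections import deque
from statistics import median


class SmoothedValue(object):
    def __init__(self, window_size):
        # deque with maxlen drops the oldest entry once the window is full
        self.deque = deque(maxlen=window_size)

    def AddValue(self, value):
        self.deque.append(value)

    def GetMedianValue(self):
        return median(self.deque)


# Usage sketch: smooth a noisy per-iteration loss over a window of 20 values.
tracker = SmoothedValue(20)
for raw_loss in [1.2, 0.9, 5.0, 1.1, 1.0]:  # 5.0 is an outlier spike
    tracker.AddValue(raw_loss)
print(tracker.GetMedianValue())  # 1.1 -- the spike barely affects the median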