Example #1
def UpdateIterStats(self):
    """Update tracked iteration statistics."""
    # Losses are summed across GPUs; all other metrics are averaged.
    for k in self.losses_and_metrics.keys():
        if k in self.model.losses:
            self.losses_and_metrics[k] = nu.sum_multi_gpu_blob(k)
        else:
            self.losses_and_metrics[k] = nu.average_multi_gpu_blob(k)
    # Push the fresh per-iteration values into the windowed trackers.
    for k, v in self.smoothed_losses_and_metrics.items():
        v.AddValue(self.losses_and_metrics[k])
    self.iter_total_loss = np.sum(
        np.array([self.losses_and_metrics[k] for k in self.model.losses]))
    self.smoothed_total_loss.AddValue(self.iter_total_loss)
    self.smoothed_mb_qsize.AddValue(
        self.model.roi_data_loader._minibatch_queue.qsize())
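This method pushes each per-iteration scalar into a SmoothedValue tracker via AddValue() and the training loop below later reads a windowed median back out via GetMedianValue(). A minimal sketch of what such a tracker can look like, assuming a deque-backed sliding window (the class and method names are taken from the calls above; the body is illustrative, not Detectron's actual implementation):

from collections import deque

import numpy as np


class SmoothedValue(object):
    """Track a scalar over a sliding window and report its median."""

    def __init__(self, window_size):
        # Oldest values fall out automatically once the window is full.
        self.deque = deque(maxlen=window_size)

    def AddValue(self, value):
        self.deque.append(value)

    def GetMedianValue(self):
        return np.median(self.deque)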
Example #2
def net_trainer():
    model, start_iter, checkpoints = create_model()
    if 'final' in checkpoints:
        return checkpoints

    add_model_inputs(model)

    if cfg.TRAIN.WEIGHTS:
        nu.initialize_gpu_0_from_weights_file(model, cfg.TRAIN.WEIGHTS)
    # Even if we're randomly initializing we still need to synchronize
    # parameters across GPUs
    nu.broadcast_parameters(model)
    workspace.CreateNet(model.net)

    output_dir = get_output_dir(training=True)
    logger.info('Outputs saved to: {:s}'.format(os.path.abspath(output_dir)))
    dump_proto_files(model, output_dir)
    json_out_file = os.path.join(output_dir, 'json_stats.log')

    # Start loading mini-batches and enqueuing blobs
    model.roi_data_loader.register_sigint_handler()
    # DEBUG data loading
    if cfg.DEBUG.DATA_LOADING:
        for _ in range(10000000):
            # this was with threading...
            # model.roi_data_loader._get_next_minibatch()
            model.roi_data_loader._get_next_minibatch2(
                model.roi_data_loader.shared_readonly_dict,
                model.roi_data_loader._lock,
                model.roi_data_loader.mp_cur,
                model.roi_data_loader.mp_perm)
        sys.exit(0)
    model.roi_data_loader.start(prefill=True)

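    # Windowed trackers for per-iteration losses/metrics, the total loss,
    # iteration timing, and the minibatch queue depth.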
    smoothed_values = {
        key: SmoothedValue(WIN_SZ) for key in model.losses + model.metrics}
    iter_values = {key: 0 for key in model.losses + model.metrics}
    total_loss = SmoothedValue(WIN_SZ)
    iter_time = SmoothedValue(WIN_SZ)
    mb_qsize = SmoothedValue(WIN_SZ)
    iter_timer = Timer()
    checkpoints = {}
    for i in range(start_iter, cfg.SOLVER.MAX_ITER):
        iter_timer.tic()
        lr = model.UpdateWorkspaceLr(i)
        workspace.RunNet(model.net.Proto().name)
        if i == start_iter:
            nu.print_net(model)
        iter_time.AddValue(iter_timer.toc(average=False))
        for k in iter_values.keys():
            if k in model.losses:
                iter_values[k] = nu.sum_multi_gpu_blob(k)
            else:
                iter_values[k] = nu.average_multi_gpu_blob(k)
        for k, v in smoothed_values.items():
            v.AddValue(iter_values[k])
        loss = np.sum(np.array([iter_values[k] for k in model.losses]))
        total_loss.AddValue(loss)
        mb_qsize.AddValue(model.roi_data_loader._minibatch_queue.qsize())

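        # Periodically emit smoothed (windowed-median) stats as JSON, along
        # with the learning rate, ETA, and peak GPU memory usage.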
        if i % LOG_PERIOD == 0 or i == cfg.SOLVER.MAX_ITER - 1:
            eta_seconds = iter_timer.average_time * (cfg.SOLVER.MAX_ITER - i)
            eta = str(datetime.timedelta(seconds=int(eta_seconds)))
            mem_stats = c2_utils.GetGPUMemoryUsageStats()
            mem_usage = np.max(mem_stats['max_by_gpu'][:cfg.NUM_GPUS])
            stats = dict(
                iter=i,
                lr=float(lr),
                time=iter_timer.average_time,
                loss=total_loss.GetMedianValue(),
                eta=eta,
                mb_qsize=int(np.round(mb_qsize.GetMedianValue())),
                mem=int(np.ceil(mem_usage / 1024 / 1024)))
            for k, v in smoothed_values.items():
                stats[k] = v.GetMedianValue()
            log_json_stats(stats, json_out_file=json_out_file)
        if cfg.DEBUG.STOP_TRAIN_ITER:
            import pdb
            pdb.set_trace()

        if ((i + 1) % int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) == 0 and
                i > start_iter):
            checkpoints[i] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(i))
            nu.save_model_to_weights_file(checkpoints[i], model)

        if i == start_iter + LOG_PERIOD:
            # Reset the iter timer after the first LOG_PERIOD iterations to
            # discard initial iterations that have outlier timings
            iter_timer.reset()

        if np.isnan(loss):
            logger.critical('Loss is NaN, exiting...')
            os._exit(0)  # FB: use code 0 to avoid flow retries

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
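Both examples read per-iteration values back out of the Caffe2 workspace through nu.sum_multi_gpu_blob and nu.average_multi_gpu_blob: losses are summed across GPUs, while other metrics are averaged. A rough sketch of that reduction, assuming data-parallel blobs are scoped as 'gpu_<id>/<name>' and taking the GPU count as an explicit argument (the real helpers read it from the global config; these bodies are illustrative, not Detectron's net_utils code):

import numpy as np
from caffe2.python import workspace


def _fetch_per_gpu(blob_name, num_gpus):
    # One copy of the blob lives in each GPU's name scope.
    return [
        workspace.FetchBlob('gpu_{}/{}'.format(gpu_id, blob_name))
        for gpu_id in range(num_gpus)
    ]


def sum_multi_gpu_blob(blob_name, num_gpus):
    # Losses: each GPU computes the loss on its own slice of the minibatch,
    # so the per-GPU values are summed.
    return np.sum([np.squeeze(v) for v in _fetch_per_gpu(blob_name, num_gpus)])


def average_multi_gpu_blob(blob_name, num_gpus):
    # Metrics (e.g. accuracies): averaging keeps them on their original scale.
    return np.mean([np.squeeze(v) for v in _fetch_per_gpu(blob_name, num_gpus)])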