def update_stats(self): """ Update the model with precise statistics. Users can manually call this method. """ if self._disabled: return if self._data_iter is None: self._data_iter = iter(self._data_loader) def data_loader(): for num_iter in itertools.count(1): if num_iter % 100 == 0: self._logger.info( "Running precise-BN ... {}/{} iterations.".format( num_iter, self._num_iter)) # This way we can reuse the same iterator yield next(self._data_iter) with EventStorage(): # capture events in a new storage to discard them self._logger.info( "Running precise-BN for {} iterations... ".format( self._num_iter) + "Note that this could produce different statistics every time." ) update_bn_stats(self._model, data_loader(), self._num_iter)
def train(self, start_epoch: int, max_epoch: int, iters_per_epoch: int): """ Args: start_iter, max_iter (int): See docs above """ logger = logging.getLogger(__name__) logger.info("Starting training from epoch {}".format(start_epoch)) self.iter = self.start_iter = start_epoch * iters_per_epoch with EventStorage(self.start_iter) as self.storage: try: self.before_train() for self.epoch in range(start_epoch, max_epoch): self.before_epoch() for _ in range(iters_per_epoch): self.before_step() self.run_step() self.after_step() self.iter += 1 self.after_epoch() except Exception: logger.exception("Exception during training:") raise finally: self.after_train()
def train(self, start_iter: int, max_iter: int): """ Args: start_iter, max_iter (int): See docs above """ logger = logging.getLogger(__name__) logger.info("Starting training from iteration {}".format(start_iter)) self.iter = self.start_iter = start_iter self.max_iter = max_iter with EventStorage(start_iter) as self.storage: self.before_train() # check hooks.py, engine/defaults.py for self.iter in range(start_iter, max_iter): self.before_step() if self.cfg.META.DATA.NAMES == '': self.run_step() else: self.run_step_meta_learning() self.after_step() self.after_train()
def train(self, start_iter: int, max_iter: int): """ Args: start_iter, max_iter (int): See docs above """ logger.info("Starting training from iteration {}".format(start_iter)) self.iter = self.start_iter = start_iter self.max_iter = max_iter with EventStorage(start_iter) as self.storage: try: self.before_train() for self.iter in range(start_iter, max_iter): self.before_step() self.run_step() self.after_step() except Exception: logger.exception("Exception during training:") finally: self.after_train()
def do_train(cfg, model, resume=False): data_loader = build_reid_train_loader(cfg) model.train() optimizer = build_optimizer(cfg, model) iters_per_epoch = len(data_loader.dataset) // cfg.SOLVER.IMS_PER_BATCH scheduler = build_lr_scheduler(cfg, optimizer, iters_per_epoch) checkpointer = Checkpointer(model, cfg.OUTPUT_DIR, save_to_disk=comm.is_main_process(), optimizer=optimizer**scheduler) start_epoch = (checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=resume).get("epoch", -1) + 1) iteration = start_iter = start_epoch * iters_per_epoch max_epoch = cfg.SOLVER.MAX_EPOCH max_iter = max_epoch * iters_per_epoch warmup_iters = cfg.SOLVER.WARMUP_ITERS delay_epochs = cfg.SOLVER.DELAY_EPOCHS periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_epoch) writers = ([ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg.OUTPUT_DIR) ] if comm.is_main_process() else []) # compared to "train_net.py", we do not support some hooks, such as # accurate timing, FP16 training and precise BN here, # because they are not trivial to implement in a small training loop logger.info("Start training from epoch {}".format(start_epoch)) with EventStorage(start_iter) as storage: for epoch in range(start_epoch, max_epoch): storage.epoch = epoch for data, _ in zip(data_loader, range(iters_per_epoch)): storage.iter = iteration loss_dict = model(data) losses = sum(loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = { k: v.item() for k, v in comm.reduce_dict(loss_dict).items() } losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) if iteration - start_iter > 5 and ( (iteration + 1) % 200 == 0 or iteration == max_iter - 1): for writer in writers: writer.write() iteration += 1 if iteration <= warmup_iters: scheduler["warmup_sched"].step() # Write metrics after each epoch for writer in writers: writer.write() if iteration > warmup_iters and (epoch + 1) >= delay_epochs: scheduler["lr_sched"].step() if (cfg.TEST.EVAL_PERIOD > 0 and (epoch + 1) % cfg.TEST.EVAL_PERIOD == 0 and epoch != max_iter - 1): do_test(cfg, model) # Compared to "train_net.py", the test results are not dumped to EventStorage periodic_checkpointer.step(epoch)