Code Example #1
File: uda_base.py Project: X-funbean/fast-reid
    def build_writers(self):
        """
        Build a list of writers to be used. By default it contains
        writers that write metrics to the screen,
        a json file, and a tensorboard event file respectively.
        If you'd like a different list of writers, you can override it in
        your trainer.
        Returns:
            list[EventWriter]: a list of :class:`EventWriter` objects.
        It is now implemented by:
        .. code-block:: python
            return [
                CommonMetricPrinter(self.max_iter),
                JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")),
                TensorboardXWriter(self.cfg.OUTPUT_DIR),
            ]
        """
        # Assume the default print/log frequency.
        # TODO: customize my writers
        return [
            # It may not always print what you want to see, since it prints "common" metrics only.
            CommonMetricPrinter(self.max_iter),
            JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(self.cfg.OUTPUT_DIR),
        ]
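
The docstring above points out that you can supply a different writer list by overriding this method in your own trainer. A minimal sketch of such an override, assuming fastreid's DefaultTrainer and its writer classes (the MyTrainer name and the trimmed writer list are illustrative, not from the source):

import os

from fastreid.engine import DefaultTrainer
from fastreid.utils.events import CommonMetricPrinter, JSONWriter


class MyTrainer(DefaultTrainer):
    def build_writers(self):
        # Keep only the console printer and the JSON log; drop TensorBoard.
        return [
            CommonMetricPrinter(self.max_iter),
            JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")),
        ]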
Code Example #2
File: train_hpo.py Project: yonghenglh6/fast-reid
    def build_hooks(self):
        r"""
        Build a list of default hooks, including timing, evaluation,
        checkpointing, lr scheduling, precise BN, writing events.
        Returns:
            list[HookBase]:
        """
        cfg = self.cfg.clone()
        cfg.defrost()

        ret = [
            hooks.IterationTimer(),
            hooks.LRScheduler(self.optimizer, self.scheduler),
        ]

        if cfg.MODEL.FREEZE_LAYERS != [''] and cfg.SOLVER.FREEZE_ITERS > 0:
            freeze_layers = ",".join(cfg.MODEL.FREEZE_LAYERS)
            logger.info(
                f'Freeze layer group "{freeze_layers}" training for {cfg.SOLVER.FREEZE_ITERS:d} iterations'
            )
            ret.append(
                hooks.FreezeLayer(
                    self.model,
                    self.optimizer,
                    cfg.MODEL.FREEZE_LAYERS,
                    cfg.SOLVER.FREEZE_ITERS,
                ))

        def test_and_save_results():
            self._last_eval_results = self.test(self.cfg, self.model)
            return self._last_eval_results

        # Do evaluation after checkpointer, because then if it fails,
        # we can use the saved checkpoint to debug.
        ret.append(TuneReportHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))

        # run writers in the end, so that evaluation metrics are written
        ret.append(
            hooks.PeriodicWriter([CommonMetricPrinter(self.max_iter)], 200))

        return ret
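
TuneReportHook is defined elsewhere in train_hpo.py and is not shown in this excerpt. A plausible minimal sketch of such a hook, assuming fastreid's hooks.EvalHook keeps the _func/_do_eval internals of its detectron2 ancestor and that Ray Tune's legacy tune.report API is in use; this is a reconstruction for illustration, not the project's actual implementation:

from ray import tune

from fastreid.engine import hooks


class TuneReportHook(hooks.EvalHook):
    """Run the eval function on schedule and report scalar metrics to Ray Tune."""

    def _do_eval(self):
        # EvalHook stores the evaluation callable as self._func.
        results = self._func()
        if results:
            # Forward only top-level scalar metrics; Tune schedulers and
            # searchers key off these values.
            scalars = {k: v for k, v in results.items()
                       if isinstance(v, (int, float))}
            tune.report(**scalars)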
Code Example #3
    def build_hooks(self):
        r"""
        Build a list of default hooks, including timing, evaluation,
        checkpointing, lr scheduling, precise BN, writing events.
        Returns:
            list[HookBase]:
        """
        cfg = self.cfg.clone()
        cfg.defrost()

        ret = [
            hooks.IterationTimer(),
            hooks.LRScheduler(self.optimizer, self.scheduler),
        ]

        ret.append(
            hooks.LayerFreeze(
                self.model,
                cfg.MODEL.FREEZE_LAYERS,
                cfg.SOLVER.FREEZE_ITERS,
                cfg.SOLVER.FREEZE_FC_ITERS,
            ))

        def test_and_save_results():
            self._last_eval_results = self.test(self.cfg, self.model)
            return self._last_eval_results

        # Do evaluation after checkpointer, because then if it fails,
        # we can use the saved checkpoint to debug.
        ret.append(TuneReportHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))

        if comm.is_main_process():
            # run writers in the end, so that evaluation metrics are written
            ret.append(
                hooks.PeriodicWriter([CommonMetricPrinter(self.max_iter)],
                                     200))

        return ret
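
Compared to example #2, this variant appends hooks.LayerFreeze unconditionally and adds a separate FREEZE_FC_ITERS budget for the classifier head. A hedged sketch of the config values that drive it, with key names taken from the snippet above (the values are illustrative; the exact semantics live in hooks.LayerFreeze):

from fastreid.config import get_cfg

cfg = get_cfg()
cfg.MODEL.FREEZE_LAYERS = ["backbone"]  # module-name prefixes to freeze
cfg.SOLVER.FREEZE_ITERS = 1000          # freeze them for the first 1000 iterations
cfg.SOLVER.FREEZE_FC_ITERS = 0          # separate freeze budget for the classifier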
Code Example #4
File: plain_train_net.py Project: zl930216/fast-reid
def do_train(cfg, model, resume=False):
    data_loader = build_reid_train_loader(cfg)

    model.train()
    optimizer = build_optimizer(cfg, model)

    iters_per_epoch = len(data_loader.dataset) // cfg.SOLVER.IMS_PER_BATCH
    scheduler = build_lr_scheduler(cfg, optimizer, iters_per_epoch)

    # `scheduler` is a dict ("warmup_sched", "lr_sched"); unpack it so both
    # schedulers are saved and restored alongside the optimizer.
    checkpointer = Checkpointer(model,
                                cfg.OUTPUT_DIR,
                                save_to_disk=comm.is_main_process(),
                                optimizer=optimizer,
                                **scheduler)

    start_epoch = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("epoch", -1) + 1)
    iteration = start_iter = start_epoch * iters_per_epoch

    max_epoch = cfg.SOLVER.MAX_EPOCH
    max_iter = max_epoch * iters_per_epoch
    warmup_iters = cfg.SOLVER.WARMUP_ITERS
    delay_epochs = cfg.SOLVER.DELAY_EPOCHS

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_epoch)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR)
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support some hooks, such as
    # accurate timing, FP16 training and precise BN here,
    # because they are not trivial to implement in a small training loop
    logger.info("Start training from epoch {}".format(start_epoch))
    with EventStorage(start_iter) as storage:
        for epoch in range(start_epoch, max_epoch):
            storage.epoch = epoch
            for data, _ in zip(data_loader, range(iters_per_epoch)):
                storage.iter = iteration

                loss_dict = model(data)
                losses = sum(loss_dict.values())
                assert torch.isfinite(losses).all(), loss_dict

                loss_dict_reduced = {
                    k: v.item()
                    for k, v in comm.reduce_dict(loss_dict).items()
                }
                losses_reduced = sum(loss
                                     for loss in loss_dict_reduced.values())
                if comm.is_main_process():
                    storage.put_scalars(total_loss=losses_reduced,
                                        **loss_dict_reduced)

                optimizer.zero_grad()
                losses.backward()
                optimizer.step()
                storage.put_scalar("lr",
                                   optimizer.param_groups[0]["lr"],
                                   smoothing_hint=False)

                if iteration - start_iter > 5 and (
                    (iteration + 1) % 200 == 0 or iteration == max_iter - 1):
                    for writer in writers:
                        writer.write()

                iteration += 1

                if iteration <= warmup_iters:
                    scheduler["warmup_sched"].step()

            # Write metrics after each epoch
            for writer in writers:
                writer.write()

            if iteration > warmup_iters and (epoch + 1) >= delay_epochs:
                scheduler["lr_sched"].step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and (epoch + 1) % cfg.TEST.EVAL_PERIOD == 0
                    and epoch != max_epoch - 1):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage

            periodic_checkpointer.step(epoch)
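
For context, do_train is typically called from a small driver script. A minimal sketch modeled on fastreid's plain training entry point; the helper names follow fastreid's public API at the time, so treat the exact import paths as assumptions:

from fastreid.config import get_cfg
from fastreid.engine import default_argument_parser, default_setup
from fastreid.modeling import build_model


def main(args):
    cfg = get_cfg()
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    default_setup(cfg, args)

    model = build_model(cfg)
    do_train(cfg, model, resume=args.resume)


if __name__ == "__main__":
    main(default_argument_parser().parse_args())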