Exemple #1
0
    def test_checkpoint_resume(self):
        model = _SimpleModel()
        dataloader = self._data_loader("cpu")
        opt = torch.optim.SGD(model.parameters(), 0.1)
        scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)

        with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
            trainer = SimpleTrainer(model, dataloader, opt)
            checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)

            trainer.register_hooks([
                hooks.LRScheduler(scheduler=scheduler),
                # checkpoint after scheduler to properly save the state of scheduler
                hooks.PeriodicCheckpointer(checkpointer, 10),
            ])

            trainer.train(0, 12)
            self.assertAlmostEqual(opt.param_groups[0]["lr"], 1e-5)
            self.assertEqual(scheduler.last_epoch, 12)
            del trainer

            opt = torch.optim.SGD(model.parameters(), 999)  # lr will be loaded
            trainer = SimpleTrainer(model, dataloader, opt)
            scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)
            trainer.register_hooks([
                hooks.LRScheduler(scheduler=scheduler),
            ])
            checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)
            checkpointer.resume_or_load("non_exist.pth")
            self.assertEqual(
                trainer.iter,
                11)  # last finished iter number (0-based in Trainer)
            # number of times `scheduler.step()` was called (1-based)
            self.assertEqual(scheduler.last_epoch, 12)
            self.assertAlmostEqual(opt.param_groups[0]["lr"], 1e-5)
    def test_checkpoint_resume(self):
        model = _SimpleModel()
        dataloader = self._data_loader("cpu")
        opt = torch.optim.SGD(model.parameters(), 0.1)
        scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)

        with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
            trainer = SimpleTrainer(model, dataloader, opt)
            checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)

            trainer.register_hooks(
                [
                    hooks.PeriodicCheckpointer(checkpointer, 10),
                    hooks.LRScheduler(scheduler=scheduler),
                ]
            )

            trainer.train(0, 12)
            del trainer

            trainer = SimpleTrainer(model, dataloader, opt)
            scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)
            trainer.register_hooks(
                [
                    hooks.LRScheduler(scheduler=scheduler),
                ]
            )
            checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)
            checkpointer.resume_or_load("non_exist.pth")
            self.assertEqual(trainer.iter, 11)  # last finished iter
            self.assertEqual(scheduler.last_epoch, 11)
Exemple #3
0
    def training_step(self, batch, batch_idx):
        data_time = time.perf_counter() - self.data_start
        # Need to manually enter/exit since trainer may launch processes
        # This ideally belongs in setup, but setup seems to run before processes are spawned
        if self.storage is None:
            self.storage = EventStorage(0)
            self.storage.__enter__()
            self.iteration_timer.trainer = weakref.proxy(self)
            self.iteration_timer.before_step()
            self.writers = (
                default_writers(self.cfg.OUTPUT_DIR, self.max_iter)
                if comm.is_main_process()
                else {}
            )

        loss_dict = self.model(batch)
        SimpleTrainer.write_metrics(loss_dict, data_time)

        opt = self.optimizers()
        self.storage.put_scalar(
            "lr", opt.param_groups[self._best_param_group_id]["lr"], smoothing_hint=False
        )
        self.iteration_timer.after_step()
        self.storage.step()
        # A little odd to put before step here, but it's the best way to get a proper timing
        self.iteration_timer.before_step()

        if self.storage.iter % 20 == 0:
            for writer in self.writers:
                writer.write()
        return sum(loss_dict.values())
    def __init__(self, cfg, optimization_level):
        logger = logging.getLogger("detectron2")
        if not logger.isEnabledFor(logging.INFO):
            setup_logger()
        model = self.build_model(cfg)
        optimizer = self.build_optimizer(cfg, model)
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=optimization_level)
        data_loader = self.build_train_loader(cfg)

        SimpleTrainer.__init__(self, model, data_loader, optimizer)

        self.scheduler = self.build_lr_scheduler(cfg, optimizer)
        self.checkpointer = DetectionCheckpointer(
            model,
            cfg.OUTPUT_DIR,
            optimizer=optimizer,
            scheduler=self.scheduler,
        )
        self.start_iter = 0
        self.max_iter = cfg.SOLVER.MAX_ITER
        self.cfg = cfg

        self.register_hooks(self.build_hooks())
    def test_simple_trainer(self, device="cpu"):
        device = torch.device(device)
        model = SimpleModel(nn.Linear(10, 10)).to(device)

        def data_loader():
            while True:
                yield torch.rand(3, 3).to(device)

        trainer = SimpleTrainer(model, data_loader(),
                                torch.optim.SGD(model.parameters(), 0.1))
        trainer.train(0, 10)
Exemple #6
0
    def __init__(self, cfg, weights: Union[str, Dict[str, Any]]):
        self.start_iter = 0
        self.max_iter = cfg.SOLVER.MAX_ITER
        self.cfg = cfg

        # We do not make any super call here and implement `__init__` from
        #  `DefaultTrainer`: we need to initialize mixed precision model before
        # wrapping to DDP, so we need to do it this way.
        model = self.build_model(cfg)
        optimizer = self.build_optimizer(cfg, model)
        data_loader = self.build_train_loader(cfg)
        scheduler = self.build_lr_scheduler(cfg, optimizer)

        # Load pre-trained weights before wrapping to DDP because `ApexDDP` has
        # some weird issue with `DetectionCheckpointer`.
        # fmt: off
        if isinstance(weights, str):
            # weights are ``str`` means ImageNet init or resume training.
            self.start_iter = (DetectionCheckpointer(
                model, optimizer=optimizer,
                scheduler=scheduler).resume_or_load(weights, resume=True).get(
                    "iteration", -1) + 1)
        elif isinstance(weights, dict):
            # weights are a state dict means our pretrain init.
            DetectionCheckpointer(model)._load_model(weights)
        # fmt: on

        # Enable distributed training if we have multiple GPUs. Use Apex DDP for
        # non-FPN backbones because its `delay_allreduce` functionality helps with
        # gradient checkpointing.
        if dist.get_world_size() > 1:
            if global_cfg.get("GRADIENT_CHECKPOINT", False):
                model = ApexDDP(model, delay_allreduce=True)
            else:
                model = nn.parallel.DistributedDataParallel(
                    model,
                    device_ids=[dist.get_rank()],
                    broadcast_buffers=False)

        # Call `__init__` from grandparent class: `SimpleTrainer`.
        SimpleTrainer.__init__(self, model, data_loader, optimizer)

        self.scheduler = scheduler
        self.checkpointer = DetectionCheckpointer(model,
                                                  cfg.OUTPUT_DIR,
                                                  optimizer=optimizer,
                                                  scheduler=self.scheduler)
        self.register_hooks(self.build_hooks())
def benchmark_train(args):
    cfg = setup(args)
    model = build_model(cfg)
    logger.info("Model:\n{}".format(model))
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    optimizer = build_optimizer(cfg, model)
    checkpointer = DetectionCheckpointer(model, optimizer=optimizer)
    checkpointer.load(cfg.MODEL.WEIGHTS)

    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 0
    data_loader = build_detection_train_loader(cfg)
    dummy_data = list(itertools.islice(data_loader, 100))

    def f():
        while True:
            yield from DatasetFromList(dummy_data, copy=False)

    max_iter = 400
    trainer = SimpleTrainer(model, f(), optimizer)
    trainer.register_hooks(
        [hooks.IterationTimer(), hooks.PeriodicWriter([CommonMetricPrinter(max_iter)])]
    )
    trainer.train(1, max_iter)
Exemple #8
0
    def test_writer_hooks(self):
        model = _SimpleModel(sleep_sec=0.1)
        trainer = SimpleTrainer(model, self._data_loader("cpu"),
                                torch.optim.SGD(model.parameters(), 0.1))

        max_iter = 50

        with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
            json_file = os.path.join(d, "metrics.json")
            writers = [CommonMetricPrinter(max_iter), JSONWriter(json_file)]

            trainer.register_hooks([
                hooks.EvalHook(0, lambda: {"metric": 100}),
                hooks.PeriodicWriter(writers)
            ])
            with self.assertLogs(writers[0].logger) as logs:
                trainer.train(0, max_iter)

            with open(json_file, "r") as f:
                data = [json.loads(line.strip()) for line in f]
                self.assertEqual([x["iteration"] for x in data],
                                 [19, 39, 49, 50])
                # the eval metric is in the last line with iter 50
                self.assertIn("metric", data[-1],
                              "Eval metric must be in last line of JSON!")

            # test logged messages from CommonMetricPrinter
            self.assertEqual(len(logs.output), 3)
            for log, iter in zip(logs.output, [19, 39, 49]):
                self.assertIn(f"iter: {iter}", log)

            self.assertIn("eta: 0:00:00", logs.output[-1],
                          "Last ETA must be 0!")
Exemple #9
0
 def test_best_checkpointer(self):
     model = _SimpleModel()
     dataloader = self._data_loader("cpu")
     opt = torch.optim.SGD(model.parameters(), 0.1)
     metric_name = "metric"
     total_iter = 40
     test_period = 10
     test_cases = [
         ("max", iter([0.3, 0.4, 0.35, 0.5]), 3),
         ("min", iter([1.0, 0.8, 0.9, 0.9]), 2),
         ("min", iter([math.nan, 0.8, 0.9, 0.9]), 1),
     ]
     for mode, metrics, call_count in test_cases:
         trainer = SimpleTrainer(model, dataloader, opt)
         with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
             checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)
             trainer.register_hooks([
                 hooks.EvalHook(test_period,
                                lambda: {metric_name: next(metrics)}),
                 hooks.BestCheckpointer(test_period,
                                        checkpointer,
                                        metric_name,
                                        mode=mode),
             ])
             with mock.patch.object(checkpointer,
                                    "save") as mock_save_method:
                 trainer.train(0, total_iter)
                 self.assertEqual(mock_save_method.call_count, call_count)
Exemple #10
0
    def test_build_model(self):
        cfg = self._get_default_cfg()
        cfg.INPUT.MIN_SIZE_TRAIN = (60, )
        cfg.MODEL.KMEANS_ANCHORS.KMEANS_ANCHORS_ON = True
        cfg.MODEL.KMEANS_ANCHORS.NUM_CLUSTERS = 3
        cfg.MODEL.KMEANS_ANCHORS.NUM_TRAINING_IMG = 5
        cfg.MODEL.KMEANS_ANCHORS.DATASETS = ("toy_dataset", )

        cfg.MODEL.DEVICE = "cpu"
        cfg.MODEL.ANCHOR_GENERATOR.NAME = "KMeansAnchorGenerator"

        with register_toy_coco_dataset(
                "toy_dataset",
                image_size=(80, 60),  # w, h
                num_images=cfg.MODEL.KMEANS_ANCHORS.NUM_TRAINING_IMG,
        ):
            model = self.runner.build_model(cfg)
            trainer = SimpleTrainer(model, data_loader=[], optimizer=None)
            trainer_hooks = [compute_kmeans_anchors_hook(self.runner, cfg)]
            trainer.register_hooks(trainer_hooks)
            trainer.before_train()
            anchor_generator = model.proposal_generator.anchor_generator
            cell_anchors = list(anchor_generator.cell_anchors)
            gt_anchors = np.array([
                [-20, -15, 20, 15]  # toy_dataset's bbox is half size of image
                for _ in range(cfg.MODEL.KMEANS_ANCHORS.NUM_CLUSTERS)
            ])
            np.testing.assert_allclose(cell_anchors[0], gt_anchors)
Exemple #11
0
    def test_build_model(self):
        cfg = self._get_default_cfg()
        cfg.INPUT.MIN_SIZE_TRAIN = (60,)
        cfg.MODEL.KMEANS_ANCHORS.KMEANS_ANCHORS_ON = True
        cfg.MODEL.KMEANS_ANCHORS.NUM_CLUSTERS = 3
        cfg.MODEL.KMEANS_ANCHORS.NUM_TRAINING_IMG = 5
        cfg.MODEL.KMEANS_ANCHORS.DATASETS = ("toy_dataset",)

        cfg.MODEL.DEVICE = "cpu"
        cfg.MODEL.ANCHOR_GENERATOR.NAME = "KMeansAnchorGenerator"

        with make_temp_directory("detectron2go_tmp_dataset") as dataset_dir:
            image_dir = os.path.join(dataset_dir, "images")
            os.makedirs(image_dir)
            image_generator = LocalImageGenerator(image_dir, width=80, height=60)
            with register_toy_dataset(
                "toy_dataset",
                image_generator,
                num_images=cfg.MODEL.KMEANS_ANCHORS.NUM_TRAINING_IMG,
            ):
                model = self.runner.build_model(cfg)
                trainer = SimpleTrainer(model, data_loader=[], optimizer=None)
                trainer_hooks = [compute_kmeans_anchors_hook(self.runner, cfg)]
                trainer.register_hooks(trainer_hooks)
                trainer.before_train()
                anchor_generator = model.proposal_generator.anchor_generator
                cell_anchors = [x for x in anchor_generator.cell_anchors]
                gt_anchors = np.array(
                    [
                        [-20, -15, 20, 15]  # toy_dataset's bbox is half size of image
                        for _ in range(cfg.MODEL.KMEANS_ANCHORS.NUM_CLUSTERS)
                    ]
                )
                np.testing.assert_allclose(cell_anchors[0], gt_anchors)
    def test_eval_hook(self):
        model = _SimpleModel()
        dataloader = self._data_loader("cpu")
        opt = torch.optim.SGD(model.parameters(), 0.1)

        for total_iter, period, eval_count in [(30, 15, 2), (31, 15, 3), (20, 0, 1)]:
            test_func = mock.Mock(return_value={"metric": 3.0})
            trainer = SimpleTrainer(model, dataloader, opt)
            trainer.register_hooks([hooks.EvalHook(period, test_func)])
            trainer.train(0, total_iter)
            self.assertEqual(test_func.call_count, eval_count)
Exemple #13
0
 def test_simple_trainer(self, device="cpu"):
     model = _SimpleModel().to(device=device)
     trainer = SimpleTrainer(model, self._data_loader(device),
                             torch.optim.SGD(model.parameters(), 0.1))
     trainer.train(0, 10)