def benchmark_train(args):
    """Benchmark training throughput: run SimpleTrainer on pre-cached batches.

    Builds the model/optimizer from the config, replays 100 cached data
    batches in a loop so the measurement is dominated by the train step,
    and reports timing via IterationTimer/CommonMetricPrinter.
    """
    cfg = setup(args)
    model = build_model(cfg)
    logger.info("Model:\n{}".format(model))
    # Wrap in DDP only when more than one process is running.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    optimizer = build_optimizer(cfg, model)
    DetectionCheckpointer(model, optimizer=optimizer).load(cfg.MODEL.WEIGHTS)

    cfg.defrost()
    # Load data in-process (no worker subprocesses) before caching the batches.
    cfg.DATALOADER.NUM_WORKERS = 0
    loader = build_detection_train_loader(cfg)
    cached_batches = list(itertools.islice(loader, 100))

    def endless_batches():
        # Replay the cached batches forever so data loading never stalls training.
        while True:
            yield from DatasetFromList(cached_batches, copy=False)

    total_iters = 400
    trainer = SimpleTrainer(model, endless_batches(), optimizer)
    timing_hooks = [
        hooks.IterationTimer(),
        hooks.PeriodicWriter([CommonMetricPrinter(total_iters)]),
    ]
    trainer.register_hooks(timing_hooks)
    trainer.train(1, total_iters)
# Example #2
 def test_best_checkpointer(self):
     """BestCheckpointer saves only when the tracked metric improves (NaN never counts)."""
     model = _SimpleModel()
     dataloader = self._data_loader("cpu")
     opt = torch.optim.SGD(model.parameters(), 0.1)
     metric_name = "metric"
     total_iter = 40
     test_period = 10
     # (mode, metric value per eval, expected number of checkpoint saves)
     test_cases = [
         ("max", iter([0.3, 0.4, 0.35, 0.5]), 3),
         ("min", iter([1.0, 0.8, 0.9, 0.9]), 2),
         ("min", iter([math.nan, 0.8, 0.9, 0.9]), 1),
     ]
     for mode, metrics, expected_saves in test_cases:
         trainer = SimpleTrainer(model, dataloader, opt)
         with tempfile.TemporaryDirectory(prefix="detectron2_test") as tmp_dir:
             checkpointer = Checkpointer(model, tmp_dir, opt=opt, trainer=trainer)
             eval_hook = hooks.EvalHook(
                 test_period, lambda: {metric_name: next(metrics)}
             )
             best_hook = hooks.BestCheckpointer(
                 test_period, checkpointer, metric_name, mode=mode
             )
             trainer.register_hooks([eval_hook, best_hook])
             # Count saves instead of inspecting files on disk.
             with mock.patch.object(checkpointer, "save") as save_mock:
                 trainer.train(0, total_iter)
                 self.assertEqual(save_mock.call_count, expected_saves)
# Example #3
    def test_writer_hooks(self):
        """Verify PeriodicWriter output for a short training run.

        Trains a slow dummy model for 50 iterations with an end-of-training
        EvalHook, then checks:
        - the JSON metrics file contains entries for iters 19/39/49/50,
          with the eval metric in the last entry;
        - CommonMetricPrinter logged exactly 3 messages (iters 19/39/49);
        - the final logged ETA is zero.
        """
        model = _SimpleModel(sleep_sec=0.1)
        trainer = SimpleTrainer(model, self._data_loader("cpu"),
                                torch.optim.SGD(model.parameters(), 0.1))

        max_iter = 50

        with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
            json_file = os.path.join(d, "metrics.json")
            writers = [CommonMetricPrinter(max_iter), JSONWriter(json_file)]

            trainer.register_hooks([
                hooks.EvalHook(0, lambda: {"metric": 100}),
                hooks.PeriodicWriter(writers)
            ])
            with self.assertLogs(writers[0].logger) as logs:
                trainer.train(0, max_iter)

            with open(json_file, "r") as f:
                data = [json.loads(line.strip()) for line in f]
                self.assertEqual([x["iteration"] for x in data],
                                 [19, 39, 49, 50])
                # the eval metric is in the last line with iter 50
                self.assertIn("metric", data[-1],
                              "Eval metric must be in last line of JSON!")

            # test logged messages from CommonMetricPrinter
            self.assertEqual(len(logs.output), 3)
            # FIX: loop variable renamed from `iter`, which shadowed the builtin.
            for log, expected_iter in zip(logs.output, [19, 39, 49]):
                self.assertIn(f"iter: {expected_iter}", log)

            self.assertIn("eta: 0:00:00", logs.output[-1],
                          "Last ETA must be 0!")
# Example #4
    def test_checkpoint_resume(self):
        """Resuming a checkpoint restores trainer iteration, LR and scheduler state."""
        model = _SimpleModel()
        dataloader = self._data_loader("cpu")
        opt = torch.optim.SGD(model.parameters(), 0.1)
        scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)

        with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
            trainer = SimpleTrainer(model, dataloader, opt)
            checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)

            # checkpoint after scheduler to properly save the state of scheduler
            trainer.register_hooks([
                hooks.LRScheduler(scheduler=scheduler),
                hooks.PeriodicCheckpointer(checkpointer, 10),
            ])

            trainer.train(0, 12)
            self.assertAlmostEqual(opt.param_groups[0]["lr"], 1e-5)
            self.assertEqual(scheduler.last_epoch, 12)
            del trainer

            # Fresh optimizer with a bogus LR: resume must overwrite it.
            opt = torch.optim.SGD(model.parameters(), 999)  # lr will be loaded
            trainer = SimpleTrainer(model, dataloader, opt)
            scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)
            trainer.register_hooks([hooks.LRScheduler(scheduler=scheduler)])
            checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)
            checkpointer.resume_or_load("non_exist.pth")
            # last finished iter number (0-based in Trainer)
            self.assertEqual(trainer.iter, 11)
            # number of times `scheduler.step()` was called (1-based)
            self.assertEqual(scheduler.last_epoch, 12)
            self.assertAlmostEqual(opt.param_groups[0]["lr"], 1e-5)
    def test_checkpoint_resume(self):
        """Resume restores trainer iter and scheduler epoch with checkpoint-first hook order."""
        model = _SimpleModel()
        dataloader = self._data_loader("cpu")
        opt = torch.optim.SGD(model.parameters(), 0.1)
        scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)

        with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
            trainer = SimpleTrainer(model, dataloader, opt)
            checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)

            # NOTE: checkpointer is registered BEFORE the scheduler hook here,
            # which is reflected in the restored `last_epoch` asserted below.
            trainer.register_hooks([
                hooks.PeriodicCheckpointer(checkpointer, 10),
                hooks.LRScheduler(scheduler=scheduler),
            ])

            trainer.train(0, 12)
            del trainer

            trainer = SimpleTrainer(model, dataloader, opt)
            scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)
            trainer.register_hooks([hooks.LRScheduler(scheduler=scheduler)])
            checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)
            checkpointer.resume_or_load("non_exist.pth")
            self.assertEqual(trainer.iter, 11)  # last finished iter
            self.assertEqual(scheduler.last_epoch, 11)
# Example #6
    def test_build_model(self):
        """KMeans anchor generator should recover the toy dataset's half-image boxes."""
        cfg = self._get_default_cfg()
        cfg.INPUT.MIN_SIZE_TRAIN = (60, )
        cfg.MODEL.KMEANS_ANCHORS.KMEANS_ANCHORS_ON = True
        cfg.MODEL.KMEANS_ANCHORS.NUM_CLUSTERS = 3
        cfg.MODEL.KMEANS_ANCHORS.NUM_TRAINING_IMG = 5
        cfg.MODEL.KMEANS_ANCHORS.DATASETS = ("toy_dataset", )

        cfg.MODEL.DEVICE = "cpu"
        cfg.MODEL.ANCHOR_GENERATOR.NAME = "KMeansAnchorGenerator"

        with register_toy_coco_dataset(
                "toy_dataset",
                image_size=(80, 60),  # w, h
                num_images=cfg.MODEL.KMEANS_ANCHORS.NUM_TRAINING_IMG,
        ):
            model = self.runner.build_model(cfg)
            # before_train triggers the kmeans hook that computes the anchors.
            trainer = SimpleTrainer(model, data_loader=[], optimizer=None)
            trainer.register_hooks([compute_kmeans_anchors_hook(self.runner, cfg)])
            trainer.before_train()
            anchor_generator = model.proposal_generator.anchor_generator
            anchors = list(anchor_generator.cell_anchors)
            # toy_dataset's bbox is half size of image
            expected = np.tile(
                [[-20, -15, 20, 15]],
                (cfg.MODEL.KMEANS_ANCHORS.NUM_CLUSTERS, 1),
            )
            np.testing.assert_allclose(anchors[0], expected)
# Example #7
    def test_build_model(self):
        """KMeans anchor generator should recover the toy dataset's half-image boxes.

        Registers a temporary on-disk toy dataset, builds the model with
        KMeansAnchorGenerator enabled, runs the kmeans hook via
        `trainer.before_train()`, and checks the computed cell anchors.
        """
        cfg = self._get_default_cfg()
        cfg.INPUT.MIN_SIZE_TRAIN = (60,)
        cfg.MODEL.KMEANS_ANCHORS.KMEANS_ANCHORS_ON = True
        cfg.MODEL.KMEANS_ANCHORS.NUM_CLUSTERS = 3
        cfg.MODEL.KMEANS_ANCHORS.NUM_TRAINING_IMG = 5
        cfg.MODEL.KMEANS_ANCHORS.DATASETS = ("toy_dataset",)

        cfg.MODEL.DEVICE = "cpu"
        cfg.MODEL.ANCHOR_GENERATOR.NAME = "KMeansAnchorGenerator"

        with make_temp_directory("detectron2go_tmp_dataset") as dataset_dir:
            image_dir = os.path.join(dataset_dir, "images")
            os.makedirs(image_dir)
            image_generator = LocalImageGenerator(image_dir, width=80, height=60)
            with register_toy_dataset(
                "toy_dataset",
                image_generator,
                num_images=cfg.MODEL.KMEANS_ANCHORS.NUM_TRAINING_IMG,
            ):
                model = self.runner.build_model(cfg)
                trainer = SimpleTrainer(model, data_loader=[], optimizer=None)
                trainer_hooks = [compute_kmeans_anchors_hook(self.runner, cfg)]
                trainer.register_hooks(trainer_hooks)
                # before_train triggers the kmeans hook that computes the anchors.
                trainer.before_train()
                anchor_generator = model.proposal_generator.anchor_generator
                # FIX: `[x for x in ...]` was an unnecessary comprehension copy.
                cell_anchors = list(anchor_generator.cell_anchors)
                gt_anchors = np.array(
                    [
                        [-20, -15, 20, 15]  # toy_dataset's bbox is half size of image
                        for _ in range(cfg.MODEL.KMEANS_ANCHORS.NUM_CLUSTERS)
                    ]
                )
                np.testing.assert_allclose(cell_anchors[0], gt_anchors)
    def test_eval_hook(self):
        """EvalHook call counts: periodic evals plus the final one (period=0 → final only)."""
        model = _SimpleModel()
        dataloader = self._data_loader("cpu")
        opt = torch.optim.SGD(model.parameters(), 0.1)

        # (total iterations, eval period, expected eval-function calls)
        cases = [(30, 15, 2), (31, 15, 3), (20, 0, 1)]
        for total_iter, period, expected_calls in cases:
            eval_fn = mock.Mock(return_value={"metric": 3.0})
            trainer = SimpleTrainer(model, dataloader, opt)
            trainer.register_hooks([hooks.EvalHook(period, eval_fn)])
            trainer.train(0, total_iter)
            self.assertEqual(eval_fn.call_count, expected_calls)