Example #1
    def test_build_model(self, tmp_dir):
        cfg = self._get_cfg(tmp_dir)
        cfg.MODEL_EMA.ENABLED = True
        task = GeneralizedRCNNTask(cfg)
        trainer = self._get_trainer(tmp_dir)

        with EventStorage() as storage:
            task.storage = storage
            trainer.fit(task)

        # test building untrained model
        model = GeneralizedRCNNTask.build_model(cfg)
        self.assertTrue(model.training)

        # test loading regular weights
        with temp_defrost(cfg):
            cfg.MODEL.WEIGHTS = os.path.join(tmp_dir, "last.ckpt")
            model = GeneralizedRCNNTask.build_model(cfg, eval_only=True)
            self.assertFalse(model.training)
            self.assertTrue(
                self._compare_state_dict(model.state_dict(),
                                         task.model.state_dict()))

        # test loading EMA weights
        with temp_defrost(cfg):
            cfg.MODEL.WEIGHTS = os.path.join(tmp_dir, "last.ckpt")
            cfg.MODEL_EMA.USE_EMA_WEIGHTS_FOR_EVAL_ONLY = True
            model = GeneralizedRCNNTask.build_model(cfg, eval_only=True)
            self.assertFalse(model.training)
            self.assertTrue(
                self._compare_state_dict(model.state_dict(),
                                         task.ema_state.state_dict()))
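
The _compare_state_dict helper used in this test is not shown on this page; a plausible sketch of such a utility, assuming tensor-valued state dicts and that torch is imported in the test module (the tolerance value is hypothetical):

    def _compare_state_dict(self, sd1, sd2, atol=1e-4):
        # State dicts match when they have identical keys and every
        # corresponding tensor is element-wise close.
        if sd1.keys() != sd2.keys():
            return False
        return all(torch.allclose(sd1[k], sd2[k], atol=atol) for k in sd1)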
Example #2
def setup_after_launch(cfg, output_dir, runner):
    """
    Set things up after entering DDP, including
        - creating working directory
        - setting up logger
        - logging environment
        - initializing runner
    """
    create_dir_on_global_main_process(output_dir)
    comm.synchronize()
    setup_loggers(output_dir)
    cfg.freeze()
    if cfg.OUTPUT_DIR != output_dir:
        with temp_defrost(cfg):
            logger.warning(
                "Override cfg.OUTPUT_DIR ({}) to be the same as output_dir {}".
                format(cfg.OUTPUT_DIR, output_dir))
            cfg.OUTPUT_DIR = output_dir
    logger.info("Initializing runner ...")
    runner = initialize_runner(runner, cfg)

    log_info(cfg, runner)
    dump_cfg(cfg, os.path.join(output_dir, "config.yaml"))

    auto_scale_world_size(cfg, new_world_size=comm.get_world_size())
Example #3
def maybe_override_output_dir(cfg: CfgNode, output_dir: str):
    if cfg.OUTPUT_DIR != output_dir:
        with temp_defrost(cfg):
            logger.warning(
                "Override cfg.OUTPUT_DIR ({}) to be the same as output_dir {}".
                format(cfg.OUTPUT_DIR, output_dir))
            cfg.OUTPUT_DIR = output_dir
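
Every example on this page wraps config mutation in temp_defrost. The helper itself is not shown here; a minimal sketch of what such a context manager looks like, assuming a yacs-style CfgNode with is_frozen()/defrost()/freeze() (the actual d2go implementation may differ):

import contextlib

@contextlib.contextmanager
def temp_defrost(cfg):
    # Remember whether the config was frozen so the state can be restored.
    is_frozen = cfg.is_frozen()
    if is_frozen:
        cfg.defrost()
    try:
        yield cfg
    finally:
        # Re-freeze only if the config was frozen on entry.
        if is_frozen:
            cfg.freeze()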
Example #4
def launch(
        main_func,
        num_processes_per_machine,
        num_machines=1,
        machine_rank=0,
        dist_url=None,
        backend="NCCL",
        always_spawn=False,
        args=(),
):
    logger.info(
        f"Launch with num_processes_per_machine: {num_processes_per_machine},"
        f" num_machines: {num_machines}, machine_rank: {machine_rank},"
        f" dist_url: {dist_url}, backend: {backend}.")

    if get_launch_environment() == "local" and not torch.cuda.is_available():
        assert len(args) > 0, args
        cfg = args[0]
        assert isinstance(cfg, CfgNode)
        if cfg.MODEL.DEVICE == "cuda":
            logger.warning(
                "Detected that CUDA is not available on this machine, set MODEL.DEVICE"
                " to cpu and backend to GLOO")
            with temp_defrost(cfg):
                cfg.MODEL.DEVICE = "cpu"
            backend = "GLOO"

    if backend == "NCCL":
        assert (
            num_processes_per_machine <= torch.cuda.device_count()
        ), "num_processes_per_machine is greater than device count: {} vs {}".format(
            num_processes_per_machine, torch.cuda.device_count())

    world_size = num_machines * num_processes_per_machine
    if world_size > 1 or always_spawn:
        # https://github.com/pytorch/pytorch/pull/14391
        # TODO prctl in spawned processes
        prefix = f"detectron2go_{main_func.__module__}.{main_func.__name__}_return"
        with tempfile.NamedTemporaryFile(prefix=prefix, suffix=".pth") as f:
            return_file = f.name
            mp.spawn(
                _distributed_worker,
                nprocs=num_processes_per_machine,
                args=(
                    main_func,
                    world_size,
                    num_processes_per_machine,
                    machine_rank,
                    dist_url,
                    backend,
                    return_file,
                    args,
                ),
                daemon=False,
            )
            if machine_rank == 0:
                return torch.load(return_file)
    else:
        return main_func(*args)
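
A hypothetical call site for launch, assuming a single machine with two GPUs and a cfg built elsewhere; the entry-point name and rendezvous URL below are placeholders:

def _train_entry(cfg):
    # Runs inside each spawned worker once the process group is set up.
    return {"status": "done"}

result = launch(
    _train_entry,
    num_processes_per_machine=2,
    num_machines=1,
    machine_rank=0,
    dist_url="tcp://127.0.0.1:29500",
    backend="NCCL",
    args=(cfg,),
)

Per the implementation above, only machine rank 0 receives the value loaded back from the temporary return file; other ranks get None.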
Example #5
    def test_qat(self, tmp_dir):
        @META_ARCH_REGISTRY.register()
        class QuantizableDetMetaArchForTest(mah.DetMetaArchForTest):
            custom_config_dict = {"preserved_attributes": ["preserved_attr"]}

            def __init__(self, cfg):
                super().__init__(cfg)
                self.avgpool.preserved_attr = "foo"
                self.avgpool.not_preserved_attr = "bar"

            def prepare_for_quant(self, cfg):
                example_inputs = (torch.rand(1, 3, 3, 3),)
                qconfig_dict = {
                    "": set_backend_and_create_qconfig(cfg, is_train=self.training)
                }
                self.avgpool = prepare_qat_fx(
                    self.avgpool,
                    qconfig_dict,
                    example_inputs,
                    self.custom_config_dict,
                )
                return self

            def prepare_for_quant_convert(self, cfg):
                self.avgpool = convert_fx(
                    self.avgpool,
                    convert_custom_config_dict=self.custom_config_dict)
                return self

        cfg = self._get_cfg(tmp_dir)
        cfg.MODEL.META_ARCHITECTURE = "QuantizableDetMetaArchForTest"
        cfg.QUANTIZATION.QAT.ENABLED = True
        task = GeneralizedRCNNTask(cfg)

        callbacks = [
            QuantizationAwareTraining.from_config(cfg),
            ModelCheckpoint(dirpath=task.cfg.OUTPUT_DIR, save_last=True),
        ]
        trainer = pl.Trainer(
            max_steps=1,
            limit_train_batches=1,
            num_sanity_val_steps=0,
            callbacks=callbacks,
            logger=False,
        )
        with EventStorage() as storage:
            task.storage = storage
            trainer.fit(task)
        prepared_avgpool = task._prepared.model.avgpool
        self.assertEqual(prepared_avgpool.preserved_attr, "foo")
        self.assertFalse(hasattr(prepared_avgpool, "not_preserved_attr"))

        with temp_defrost(cfg):
            cfg.MODEL.WEIGHTS = os.path.join(tmp_dir, "last.ckpt")
            model = GeneralizedRCNNTask.build_model(cfg, eval_only=True)
            self.assertTrue(isinstance(model.avgpool, torch.fx.GraphModule))
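
Outside the meta-arch wrapper, the prepare_for_quant/prepare_for_quant_convert pair above is plain FX graph mode quantization; a minimal standalone sketch, assuming a recent torch.ao.quantization (module paths have moved between torch releases):

import torch
from torch.ao.quantization import QConfigMapping, get_default_qat_qconfig
from torch.ao.quantization.quantize_fx import convert_fx, prepare_qat_fx

model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU())
example_inputs = (torch.rand(1, 3, 8, 8),)
qconfig_mapping = QConfigMapping().set_global(get_default_qat_qconfig("qnnpack"))

# Insert fake-quant/observer modules, run QAT, then convert to a quantized model.
prepared = prepare_qat_fx(model.train(), qconfig_mapping, example_inputs)
# ... a few training iterations on `prepared` would go here ...
quantized = convert_fx(prepared.eval())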
Example #6
def main(
    cfg,
    output_dir,
    runner,
    # binary specific optional arguments
    predictor_types: typing.List[str],
    device: str = "cpu",
    compare_accuracy: bool = False,
    skip_if_fail: bool = False,
):
    if compare_accuracy:
        raise NotImplementedError(
            "compare_accuracy functionality isn't currently supported.")
        # NOTE: dict for metrics of all exported models (and original pytorch model)
        # ret["accuracy_comparison"] = accuracy_comparison

    cfg = copy.deepcopy(cfg)
    setup_after_launch(cfg, output_dir, runner)

    with temp_defrost(cfg):
        cfg.merge_from_list(["MODEL.DEVICE", device])
    model = runner.build_model(cfg, eval_only=True)

    # NOTE: train dataset is used to avoid leakage since the data might be used for
    # running calibration for quantization. test_loader is used to make sure it follows
    # the inference behaviour (augmentation will not be applied).
    datasets = list(cfg.DATASETS.TRAIN)
    data_loader = runner.build_detection_test_loader(cfg, datasets)

    logger.info("Running the pytorch model and print FLOPS ...")
    first_batch = next(iter(data_loader))
    input_args = (first_batch, )
    flops_utils.print_model_flops(model, input_args)

    predictor_paths: typing.Dict[str, str] = {}
    for typ in predictor_types:
        # convert_and_export_predictor might alter the model, copy before calling it
        pytorch_model = copy.deepcopy(model)
        try:
            predictor_path = convert_and_export_predictor(
                cfg,
                pytorch_model,
                typ,
                output_dir,
                data_loader,
            )
            logger.info(
                f"Predictor type {typ} has been exported to {predictor_path}")
            predictor_paths[typ] = predictor_path
        except Exception as e:
            logger.exception(f"Export {typ} predictor failed: {e}")
            if not skip_if_fail:
                raise e

    ret = {"predictor_paths": predictor_paths, "accuracy_comparison": {}}

    return ret
Example #7
def update_cfg_from_pb_model(cfg, model):
    """
    Update cfg statically based given caffe2 model, in cast that there's conflict
    between caffe2 model and the cfg, caffe2 model has higher priority.
    """
    with temp_defrost(cfg):
        _update_if_true(cfg, "MODEL.MASK_ON", infer_mask_on(model))
        _update_if_true(cfg, "MODEL.KEYPOINT_ON", infer_keypoint_on(model))
        _update_if_true(cfg, "MODEL.DENSEPOSE_ON", infer_densepose_on(model))
    return cfg
Example #8
    def test_build_model(self, tmp_dir):
        cfg = self._get_cfg(tmp_dir)
        cfg.MODEL_EMA.ENABLED = True
        task = GeneralizedRCNNTask(cfg)
        checkpoint_callback = ModelCheckpoint(dirpath=task.cfg.OUTPUT_DIR,
                                              save_last=True)

        trainer = pl.Trainer(
            max_steps=1,
            limit_train_batches=1,
            num_sanity_val_steps=0,
            callbacks=[checkpoint_callback],
        )

        with EventStorage() as storage:
            task.storage = storage
            trainer.fit(task)

        # test building untrained model
        model = GeneralizedRCNNTask.build_model(cfg)
        self.assertTrue(model.training)

        # test loading regular weights
        with temp_defrost(cfg):
            cfg.MODEL.WEIGHTS = os.path.join(tmp_dir, "last.ckpt")
            model = GeneralizedRCNNTask.build_model(cfg, eval_only=True)
            self.assertFalse(model.training)
            self.assertTrue(
                self._compare_state_dict(model.state_dict(),
                                         task.model.state_dict()))

        # test loading EMA weights
        with temp_defrost(cfg):
            cfg.MODEL.WEIGHTS = os.path.join(tmp_dir, "last.ckpt")
            cfg.MODEL_EMA.USE_EMA_WEIGHTS_FOR_EVAL_ONLY = True
            model = GeneralizedRCNNTask.build_model(cfg, eval_only=True)
            self.assertFalse(model.training)
            self.assertTrue(
                self._compare_state_dict(model.state_dict(),
                                         task.ema_state.state_dict()))
Example #9
def update_cfg_if_using_adhoc_dataset(cfg):
    if cfg.D2GO_DATA.DATASETS.TRAIN_CATEGORIES:
        new_train_datasets = [
            COCOWithClassesToUse(name, cfg.D2GO_DATA.DATASETS.TRAIN_CATEGORIES)
            for name in cfg.DATASETS.TRAIN
        ]
        for new_ds in new_train_datasets:
            AdhocDatasetManager.add(new_ds)
        with temp_defrost(cfg):
            cfg.DATASETS.TRAIN = tuple(ds.new_ds_name
                                       for ds in new_train_datasets)

    if cfg.D2GO_DATA.DATASETS.TEST_CATEGORIES:
        new_test_datasets = [
            COCOWithClassesToUse(ds, cfg.D2GO_DATA.DATASETS.TEST_CATEGORIES)
            for ds in cfg.DATASETS.TEST
        ]
        for new_ds in new_test_datasets:
            AdhocDatasetManager.add(new_ds)
        with temp_defrost(cfg):
            cfg.DATASETS.TEST = tuple(ds.new_ds_name
                                      for ds in new_test_datasets)

    return cfg
Example #10
    def before_train_callback(trainer):
        if not cfg.MODEL.KMEANS_ANCHORS.KMEANS_ANCHORS_ON:
            return

        new_cfg = cfg.clone()
        with temp_defrost(new_cfg):
            new_cfg.DATASETS.TRAIN = cfg.MODEL.KMEANS_ANCHORS.DATASETS
            data_loader = runner.build_detection_train_loader(new_cfg)

        anchors = compute_kmeans_anchors(cfg, data_loader)
        anchors = anchors.tolist()

        assert isinstance(trainer.model, GeneralizedRCNN)
        assert isinstance(trainer.model.proposal_generator, RPN)
        anchor_generator = trainer.model.proposal_generator.anchor_generator
        assert isinstance(anchor_generator, KMeansAnchorGenerator)
        anchor_generator.update_cell_anchors(anchors)
Example #11
@contextlib.contextmanager
def maybe_subsample_n_images(cfg, is_train=False):
    """
    Yield a new config whose train/test datasets only take a subsample of
    `max_images` images. Uses all images (a no-op) when `max_images` <= 0.
    """
    max_images = cfg.D2GO_DATA.TEST.MAX_IMAGES
    sampling = cfg.D2GO_DATA.TEST.SUBSET_SAMPLING
    with contextlib.ExitStack() as stack:  # python 3.3+
        new_splits = tuple(
            stack.enter_context(
                register_sub_dataset_with_n_images(ds, max_images, sampling))
            for ds in (cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST))
        new_cfg = cfg.clone()
        with temp_defrost(new_cfg):
            if is_train:
                new_cfg.DATASETS.TRAIN = new_splits
            else:
                new_cfg.DATASETS.TEST = new_splits
        yield new_cfg
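
Since the function yields, it is used as a context manager so the temporarily registered sub-datasets are cleaned up on exit; a hypothetical call site (runner and model are assumed to exist):

with maybe_subsample_n_images(cfg) as new_cfg:
    # new_cfg.DATASETS.TEST now points at the subsampled splits.
    results = runner.do_test(new_cfg, model)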
Example #12
def _setup_after_launch(cfg: CN, output_dir: str, runner):
    """
    Set things up after entering DDP, including
        - creating working directory
        - setting up logger
        - logging environment
        - initializing runner
    """
    create_dir_on_global_main_process(output_dir)
    comm.synchronize()
    setup_loggers(output_dir)
    cfg.freeze()
    if cfg.OUTPUT_DIR != output_dir:
        with temp_defrost(cfg):
            logger.warning(
                "Override cfg.OUTPUT_DIR ({}) to be the same as output_dir {}".format(
                    cfg.OUTPUT_DIR, output_dir
                )
            )
            cfg.OUTPUT_DIR = output_dir
    dump_cfg(cfg, os.path.join(output_dir, "config.yaml"))
Example #13
def do_train(cfg: CfgNode, trainer: pl.Trainer,
             task: GeneralizedRCNNTask) -> Dict[str, str]:
    """Runs the training loop with given trainer and task.

    Args:
        cfg: The normalized ConfigNode for this D2Go Task.
        trainer: PyTorch Lightning trainer.
        task: Lightning module instance.

    Returns:
        A map of model name to trained model config path.
    """
    with EventStorage() as storage:
        task.storage = storage
        trainer.fit(task)
        final_ckpt = os.path.join(cfg.OUTPUT_DIR, FINAL_MODEL_CKPT)
        trainer.save_checkpoint(final_ckpt)  # for validation monitor

        trained_cfg = cfg.clone()
        with temp_defrost(trained_cfg):
            trained_cfg.MODEL.WEIGHTS = final_ckpt
        model_configs = dump_trained_model_configs(
            cfg.OUTPUT_DIR, {"model_final": trained_cfg})
    return model_configs
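
A hypothetical end-to-end wiring of do_train, reusing the task/trainer construction pattern from the test examples above (all names come from those examples):

task = GeneralizedRCNNTask(cfg)
trainer = pl.Trainer(
    max_steps=cfg.SOLVER.MAX_ITER,
    callbacks=[ModelCheckpoint(dirpath=cfg.OUTPUT_DIR, save_last=True)],
    logger=False,
)
model_configs = do_train(cfg, trainer, task)
# model_configs["model_final"] is the path to the dumped config whose
# MODEL.WEIGHTS points at the final checkpoint.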
Example #14
    def do_train(self, cfg, model, resume):
        add_print_flops_callback(cfg, model, disable_after_callback=True)

        optimizer = self.build_optimizer(cfg, model)
        scheduler = self.build_lr_scheduler(cfg, optimizer)

        checkpointer = self.build_checkpointer(
            cfg,
            model,
            save_dir=cfg.OUTPUT_DIR,
            optimizer=optimizer,
            scheduler=scheduler,
        )
        checkpoint = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS,
                                                 resume=resume)
        start_iter = (checkpoint.get("iteration", -1)
                      if resume and checkpointer.has_checkpoint() else -1)
        # The checkpoint stores the training iteration that just finished, thus we start
        # at the next iteration (or iter zero if there's no checkpoint).
        start_iter += 1
        max_iter = cfg.SOLVER.MAX_ITER
        periodic_checkpointer = PeriodicCheckpointer(
            checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)

        data_loader = self.build_detection_train_loader(cfg)

        def _get_model_with_abnormal_checker(model):
            if not cfg.ABNORMAL_CHECKER.ENABLED:
                return model

            tbx_writer = _get_tbx_writer(
                get_tensorboard_log_dir(cfg.OUTPUT_DIR))
            writers = abnormal_checker.get_writers(cfg, tbx_writer)
            checker = abnormal_checker.AbnormalLossChecker(start_iter, writers)
            ret = abnormal_checker.AbnormalLossCheckerWrapper(model, checker)
            return ret

        trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(
            _get_model_with_abnormal_checker(model), data_loader, optimizer)
        trainer_hooks = [
            hooks.IterationTimer(),
            model_ema.EMAHook(cfg, model) if cfg.MODEL_EMA.ENABLED else None,
            self._create_after_step_hook(cfg, model, optimizer, scheduler,
                                         periodic_checkpointer),
            hooks.EvalHook(
                cfg.TEST.EVAL_PERIOD,
                lambda: self.do_test(cfg, model, train_iter=trainer.iter),
            ),
            kmeans_anchors.compute_kmeans_anchors_hook(self, cfg),
            self._create_qat_hook(cfg)
            if cfg.QUANTIZATION.QAT.ENABLED else None,
        ]

        if comm.is_main_process():
            tbx_writer = _get_tbx_writer(
                get_tensorboard_log_dir(cfg.OUTPUT_DIR))
            writers = [
                CommonMetricPrinter(max_iter),
                JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
                tbx_writer,
            ]
            trainer_hooks.append(hooks.PeriodicWriter(writers))
        trainer.register_hooks(trainer_hooks)
        trainer.train(start_iter, max_iter)

        if hasattr(self, 'original_cfg'):
            table = get_cfg_diff_table(cfg, self.original_cfg)
            logger.info(
                "GeneralizeRCNN Runner ignoring training config change: \n" +
                table)
            trained_cfg = self.original_cfg.clone()
        else:
            trained_cfg = cfg.clone()
        with temp_defrost(trained_cfg):
            trained_cfg.MODEL.WEIGHTS = checkpointer.get_checkpoint_file()
        return {"model_final": trained_cfg}
Example #15
    def do_train(self, cfg, model, resume):
        # Note that flops measured at the beginning of training are often
        # inaccurate if the model has input-dependent logic
        attach_profilers(cfg, model)

        optimizer = self.build_optimizer(cfg, model)
        scheduler = self.build_lr_scheduler(cfg, optimizer)

        checkpointer = self.build_checkpointer(
            cfg,
            model,
            save_dir=cfg.OUTPUT_DIR,
            optimizer=optimizer,
            scheduler=scheduler,
        )
        checkpoint = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS,
                                                 resume=resume)
        start_iter = (checkpoint.get("iteration", -1)
                      if resume and checkpointer.has_checkpoint() else -1)
        # The checkpoint stores the training iteration that just finished, thus we start
        # at the next iteration (or iter zero if there's no checkpoint).
        start_iter += 1
        max_iter = cfg.SOLVER.MAX_ITER
        periodic_checkpointer = PeriodicCheckpointer(
            checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)

        data_loader = self.build_detection_train_loader(cfg)

        def _get_model_with_abnormal_checker(model):
            if not cfg.ABNORMAL_CHECKER.ENABLED:
                return model

            tbx_writer = self.get_tbx_writer(cfg)
            writers = abnormal_checker.get_writers(cfg, tbx_writer)
            checker = abnormal_checker.AbnormalLossChecker(start_iter, writers)
            ret = abnormal_checker.AbnormalLossCheckerWrapper(model, checker)
            return ret

        trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(
            _get_model_with_abnormal_checker(model), data_loader, optimizer)
        trainer_hooks = self._get_trainer_hooks(cfg, model, optimizer,
                                                scheduler,
                                                periodic_checkpointer, trainer)

        if comm.is_main_process():
            tbx_writer = self.get_tbx_writer(cfg)
            writers = [
                CommonMetricPrinter(max_iter),
                JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
                tbx_writer,
            ]
            trainer_hooks.append(hooks.PeriodicWriter(writers))
        update_hooks_from_registry(trainer_hooks)
        trainer.register_hooks(trainer_hooks)
        trainer.train(start_iter, max_iter)

        if hasattr(self, "original_cfg"):
            table = get_cfg_diff_table(cfg, self.original_cfg)
            logger.info(
                "GeneralizeRCNN Runner ignoring training config change: \n" +
                table)
            trained_cfg = self.original_cfg.clone()
        else:
            trained_cfg = cfg.clone()
        with temp_defrost(trained_cfg):
            trained_cfg.MODEL.WEIGHTS = checkpointer.get_checkpoint_file()
        return {"model_final": trained_cfg}