def test_build_model(self, tmp_dir):
    """Train one step with EMA enabled, then check that build_model can load
    both the regular and the EMA weights from the saved checkpoint."""
    cfg = self._get_cfg(tmp_dir)
    cfg.MODEL_EMA.ENABLED = True
    task = GeneralizedRCNNTask(cfg)

    trainer = self._get_trainer(tmp_dir)
    with EventStorage() as storage:
        task.storage = storage
        trainer.fit(task)

    ckpt_path = os.path.join(tmp_dir, "last.ckpt")

    # An untrained model (no weights loaded) stays in training mode.
    untrained = GeneralizedRCNNTask.build_model(cfg)
    self.assertTrue(untrained.training)

    # Loading the regular weights should reproduce the trained model exactly.
    with temp_defrost(cfg):
        cfg.MODEL.WEIGHTS = ckpt_path
        eval_model = GeneralizedRCNNTask.build_model(cfg, eval_only=True)
    self.assertFalse(eval_model.training)
    self.assertTrue(
        self._compare_state_dict(eval_model.state_dict(), task.model.state_dict())
    )

    # With the EMA flag set, the EMA state should be loaded instead.
    with temp_defrost(cfg):
        cfg.MODEL.WEIGHTS = ckpt_path
        cfg.MODEL_EMA.USE_EMA_WEIGHTS_FOR_EVAL_ONLY = True
        ema_model = GeneralizedRCNNTask.build_model(cfg, eval_only=True)
    self.assertFalse(ema_model.training)
    self.assertTrue(
        self._compare_state_dict(ema_model.state_dict(), task.ema_state.state_dict())
    )
def setup_after_launch(cfg, output_dir, runner):
    """
    Set things up after entering DDP, including
    - creating working directory
    - setting up logger
    - logging environment
    - initializing runner

    Args:
        cfg: the (possibly still mutable) config node; it is frozen here.
        output_dir: working directory; cfg.OUTPUT_DIR is forced to match it.
        runner: the runner object to initialize.
    """
    create_dir_on_global_main_process(output_dir)
    # Make sure the directory exists on every rank before proceeding.
    comm.synchronize()
    setup_loggers(output_dir)
    cfg.freeze()
    # Reuse the shared helper instead of duplicating the override logic inline
    # (keeps this in sync with maybe_override_output_dir).
    maybe_override_output_dir(cfg, output_dir)
    logger.info("Initializing runner ...")
    # NOTE(review): the rebound local is passed on to log_info below; the
    # caller's reference is not updated — presumably initialize_runner returns
    # the (possibly wrapped) runner on purpose.
    runner = initialize_runner(runner, cfg)
    log_info(cfg, runner)
    dump_cfg(cfg, os.path.join(output_dir, "config.yaml"))
    auto_scale_world_size(cfg, new_world_size=comm.get_world_size())
def maybe_override_output_dir(cfg: CfgNode, output_dir: str):
    """Force ``cfg.OUTPUT_DIR`` to equal ``output_dir``, warning on mismatch."""
    if cfg.OUTPUT_DIR == output_dir:
        return
    logger.warning(
        "Override cfg.OUTPUT_DIR ({}) to be the same as output_dir {}".format(
            cfg.OUTPUT_DIR, output_dir
        )
    )
    with temp_defrost(cfg):
        cfg.OUTPUT_DIR = output_dir
def launch(
    main_func,
    num_processes_per_machine,
    num_machines=1,
    machine_rank=0,
    dist_url=None,
    backend="NCCL",
    always_spawn=False,
    args=(),
):
    """Launch ``main_func`` for (possibly multi-machine) distributed training.

    Spawns ``num_processes_per_machine`` worker processes when the world size
    is greater than 1 (or ``always_spawn`` is set); otherwise calls
    ``main_func(*args)`` directly in the current process.

    Args:
        main_func: the entry function executed by each worker.
        num_processes_per_machine: processes (typically GPUs) per machine.
        num_machines: total number of machines.
        machine_rank: rank of this machine among all machines.
        dist_url: URL used to initialize the process group.
        backend: distributed backend name ("NCCL" or "GLOO").
        always_spawn: spawn subprocesses even for world size 1.
        args: positional args forwarded to ``main_func``; args[0] is assumed
            to be the config when running locally without CUDA.

    Returns:
        The return value of ``main_func`` (rank-0 result, loaded from a temp
        file when workers are spawned; None on non-zero machine ranks).
    """
    logger.info(
        f"Launch with num_processes_per_machine: {num_processes_per_machine},"
        f" num_machines: {num_machines}, machine_rank: {machine_rank},"
        f" dist_url: {dist_url}, backend: {backend}."
    )
    # Local run without CUDA: fall back to CPU + GLOO since NCCL requires GPUs.
    if get_launch_environment() == "local" and not torch.cuda.is_available():
        assert len(args) > 0, args
        cfg = args[0]
        assert isinstance(cfg, CfgNode)
        if cfg.MODEL.DEVICE == "cuda":
            logger.warning(
                "Detected that CUDA is not available on this machine, set MODEL.DEVICE"
                " to cpu and backend to GLOO"
            )
            with temp_defrost(cfg):
                cfg.MODEL.DEVICE = "cpu"
            backend = "GLOO"

    if backend == "NCCL":
        # NCCL binds one process per GPU, so we cannot oversubscribe devices.
        assert (
            num_processes_per_machine <= torch.cuda.device_count()
        ), "num_processes_per_machine is greater than device count: {} vs {}".format(
            num_processes_per_machine, torch.cuda.device_count()
        )

    world_size = num_machines * num_processes_per_machine
    if world_size > 1 or always_spawn:
        # https://github.com/pytorch/pytorch/pull/14391
        # TODO prctl in spawned processes
        # Workers serialize rank-0's return value into this temp file so the
        # parent process can recover it after mp.spawn finishes.
        prefix = f"detectron2go_{main_func.__module__}.{main_func.__name__}_return"
        with tempfile.NamedTemporaryFile(prefix=prefix, suffix=".pth") as f:
            return_file = f.name

        mp.spawn(
            _distributed_worker,
            nprocs=num_processes_per_machine,
            args=(
                main_func,
                world_size,
                num_processes_per_machine,
                machine_rank,
                dist_url,
                backend,
                return_file,
                args,
            ),
            daemon=False,
        )
        if machine_rank == 0:
            return torch.load(return_file)
    else:
        return main_func(*args)
def test_qat(self, tmp_dir):
    """Run one QAT training step and check that FX graph-mode quantization is
    applied and that ``preserved_attributes`` survive prepare/convert."""

    @META_ARCH_REGISTRY.register()
    class QuantizableDetMetaArchForTest(mah.DetMetaArchForTest):
        # Attributes listed here must be carried over by prepare_qat_fx/convert_fx.
        custom_config_dict = {"preserved_attributes": ["preserved_attr"]}

        def __init__(self, cfg):
            super().__init__(cfg)
            # One attribute is preserved via the config above, the other is not.
            self.avgpool.preserved_attr = "foo"
            self.avgpool.not_preserved_attr = "bar"

        def prepare_for_quant(self, cfg):
            # Swap the avgpool submodule for its QAT-prepared FX graph module.
            example_inputs = (torch.rand(1, 3, 3, 3), )
            self.avgpool = prepare_qat_fx(
                self.avgpool,
                {
                    "": set_backend_and_create_qconfig(cfg, is_train=self.training)
                },
                example_inputs,
                self.custom_config_dict,
            )
            return self

        def prepare_for_quant_convert(self, cfg):
            self.avgpool = convert_fx(
                self.avgpool, convert_custom_config_dict=self.custom_config_dict
            )
            return self

    cfg = self._get_cfg(tmp_dir)
    cfg.MODEL.META_ARCHITECTURE = "QuantizableDetMetaArchForTest"
    cfg.QUANTIZATION.QAT.ENABLED = True
    task = GeneralizedRCNNTask(cfg)

    callbacks = [
        QuantizationAwareTraining.from_config(cfg),
        ModelCheckpoint(dirpath=task.cfg.OUTPUT_DIR, save_last=True),
    ]
    trainer = pl.Trainer(
        max_steps=1,
        limit_train_batches=1,
        num_sanity_val_steps=0,
        callbacks=callbacks,
        logger=False,
    )
    with EventStorage() as storage:
        task.storage = storage
        trainer.fit(task)
        # The QAT callback keeps the prepared model on task._prepared.
        prepared_avgpool = task._prepared.model.avgpool
        self.assertEqual(prepared_avgpool.preserved_attr, "foo")
        self.assertFalse(hasattr(prepared_avgpool, "not_preserved_attr"))

        with temp_defrost(cfg):
            cfg.MODEL.WEIGHTS = os.path.join(tmp_dir, "last.ckpt")
        # Loading for eval triggers convert_fx, so avgpool becomes a GraphModule.
        model = GeneralizedRCNNTask.build_model(cfg, eval_only=True)
        self.assertTrue(isinstance(model.avgpool, torch.fx.GraphModule))
def main(
    cfg,
    output_dir,
    runner,
    # binary specific optional arguments
    predictor_types: typing.List[str],
    device: str = "cpu",
    compare_accuracy: bool = False,
    skip_if_fail: bool = False,
):
    """Export the model built from ``cfg`` to each requested predictor type.

    Args:
        cfg: config used to build the model; deep-copied before mutation.
        output_dir: directory where exported predictors are written.
        runner: runner providing model/data-loader builders.
        predictor_types: list of predictor type names to export.
        device: device to run the model on during export.
        compare_accuracy: not supported yet; raises if True.
        skip_if_fail: when True, log and continue on a failed export instead
            of re-raising.

    Returns:
        Dict with "predictor_paths" (type -> exported path) and an empty
        "accuracy_comparison".
    """
    if compare_accuracy:
        raise NotImplementedError(
            "compare_accuracy functionality isn't currently supported."
        )
        # NOTE: dict for metrics of all exported models (and original pytorch model)
        # ret["accuracy_comparison"] = accuracy_comparison

    cfg = copy.deepcopy(cfg)
    setup_after_launch(cfg, output_dir, runner)
    with temp_defrost(cfg):
        cfg.merge_from_list(["MODEL.DEVICE", device])
    model = runner.build_model(cfg, eval_only=True)

    # NOTE: train dataset is used to avoid leakage since the data might be used for
    # running calibration for quantization. test_loader is used to make sure it follows
    # the inference behaviour (augmentation will not be applied).
    datasets = list(cfg.DATASETS.TRAIN)
    data_loader = runner.build_detection_test_loader(cfg, datasets)

    logger.info("Running the pytorch model and print FLOPS ...")
    first_batch = next(iter(data_loader))
    input_args = (first_batch, )
    flops_utils.print_model_flops(model, input_args)

    predictor_paths: typing.Dict[str, str] = {}
    for typ in predictor_types:
        # convert_and_export_predictor might alter the model, copy before calling it
        pytorch_model = copy.deepcopy(model)
        try:
            predictor_path = convert_and_export_predictor(
                cfg,
                pytorch_model,
                typ,
                output_dir,
                data_loader,
            )
            logger.info(
                f"Predictor type {typ} has been exported to {predictor_path}"
            )
            predictor_paths[typ] = predictor_path
        except Exception as e:
            logger.exception(f"Export {typ} predictor failed: {e}")
            if not skip_if_fail:
                # Bare raise preserves the active exception and its traceback
                # exactly (avoids re-raising via `raise e`).
                raise

    ret = {"predictor_paths": predictor_paths, "accuracy_comparison": {}}
    return ret
def update_cfg_from_pb_model(cfg, model):
    """
    Update cfg statically based given caffe2 model, in cast that there's conflict
    between caffe2 model and the cfg, caffe2 model has higher priority.
    """
    # Each flag is inferred from the model; the model wins on conflict.
    inferences = (
        ("MODEL.MASK_ON", infer_mask_on),
        ("MODEL.KEYPOINT_ON", infer_keypoint_on),
        ("MODEL.DENSEPOSE_ON", infer_densepose_on),
    )
    with temp_defrost(cfg):
        for cfg_key, infer_fn in inferences:
            _update_if_true(cfg, cfg_key, infer_fn(model))
    return cfg
def test_build_model(self, tmp_dir):
    """Fit for a single step with EMA on, then verify that build_model loads
    plain weights and EMA weights from the resulting checkpoint."""
    cfg = self._get_cfg(tmp_dir)
    cfg.MODEL_EMA.ENABLED = True
    task = GeneralizedRCNNTask(cfg)

    checkpoint_callback = ModelCheckpoint(
        dirpath=task.cfg.OUTPUT_DIR, save_last=True
    )
    trainer = pl.Trainer(
        max_steps=1,
        limit_train_batches=1,
        num_sanity_val_steps=0,
        callbacks=[checkpoint_callback],
    )
    with EventStorage() as storage:
        task.storage = storage
        trainer.fit(task)

    last_ckpt = os.path.join(tmp_dir, "last.ckpt")

    # Untrained model: no weights loaded, still in training mode.
    fresh_model = GeneralizedRCNNTask.build_model(cfg)
    self.assertTrue(fresh_model.training)

    # Regular weights round-trip through the checkpoint.
    with temp_defrost(cfg):
        cfg.MODEL.WEIGHTS = last_ckpt
        loaded = GeneralizedRCNNTask.build_model(cfg, eval_only=True)
    self.assertFalse(loaded.training)
    self.assertTrue(
        self._compare_state_dict(loaded.state_dict(), task.model.state_dict())
    )

    # EMA weights are selected when USE_EMA_WEIGHTS_FOR_EVAL_ONLY is set.
    with temp_defrost(cfg):
        cfg.MODEL.WEIGHTS = last_ckpt
        cfg.MODEL_EMA.USE_EMA_WEIGHTS_FOR_EVAL_ONLY = True
        loaded_ema = GeneralizedRCNNTask.build_model(cfg, eval_only=True)
    self.assertFalse(loaded_ema.training)
    self.assertTrue(
        self._compare_state_dict(loaded_ema.state_dict(), task.ema_state.state_dict())
    )
def update_cfg_if_using_adhoc_dataset(cfg):
    """Rewrite cfg's train/test dataset names to ad-hoc class-subset datasets.

    When TRAIN_CATEGORIES / TEST_CATEGORIES are set, wraps each dataset in a
    COCOWithClassesToUse restricted to those categories, registers the new
    datasets with AdhocDatasetManager, and points cfg.DATASETS at the new
    names.

    Args:
        cfg: config node; mutated in place (via temp_defrost) and returned.

    Returns:
        The same cfg, possibly with DATASETS.TRAIN / DATASETS.TEST replaced.
    """
    if cfg.D2GO_DATA.DATASETS.TRAIN_CATEGORIES:
        new_train_datasets = [
            COCOWithClassesToUse(name, cfg.D2GO_DATA.DATASETS.TRAIN_CATEGORIES)
            for name in cfg.DATASETS.TRAIN
        ]
        # Plain loop: registration is a side effect, not a value computation
        # (previously done via a discarded list comprehension).
        for new_ds in new_train_datasets:
            AdhocDatasetManager.add(new_ds)
        with temp_defrost(cfg):
            cfg.DATASETS.TRAIN = tuple(
                ds.new_ds_name for ds in new_train_datasets
            )

    if cfg.D2GO_DATA.DATASETS.TEST_CATEGORIES:
        new_test_datasets = [
            COCOWithClassesToUse(ds, cfg.D2GO_DATA.DATASETS.TEST_CATEGORIES)
            for ds in cfg.DATASETS.TEST
        ]
        for new_ds in new_test_datasets:
            AdhocDatasetManager.add(new_ds)
        with temp_defrost(cfg):
            cfg.DATASETS.TEST = tuple(
                ds.new_ds_name for ds in new_test_datasets
            )

    return cfg
def before_train_callback(trainer):
    """Compute k-means anchors from the configured datasets and install them
    on the trainer's anchor generator (no-op when the feature is off).

    NOTE(review): reads `cfg` and `runner` as free variables — presumably a
    closure defined inside a runner/hook factory.
    """
    if not cfg.MODEL.KMEANS_ANCHORS.KMEANS_ANCHORS_ON:
        return

    # Clone so the anchor-computation dataset swap never leaks into cfg.
    kmeans_cfg = cfg.clone()
    with temp_defrost(kmeans_cfg):
        kmeans_cfg.DATASETS.TRAIN = cfg.MODEL.KMEANS_ANCHORS.DATASETS
    loader = runner.build_detection_train_loader(kmeans_cfg)
    cell_anchors = compute_kmeans_anchors(cfg, loader).tolist()

    assert isinstance(trainer.model, GeneralizedRCNN)
    assert isinstance(trainer.model.proposal_generator, RPN)
    anchor_generator = trainer.model.proposal_generator.anchor_generator
    assert isinstance(anchor_generator, KMeansAnchorGenerator)
    anchor_generator.update_cell_anchors(cell_anchors)
def maybe_subsample_n_images(cfg, is_train=False):
    """
    Create a new config whose train/test datasets only take a subsample of
    `max_images` image. Use all images (non-op) when `max_images` <= 0.
    """
    max_images = cfg.D2GO_DATA.TEST.MAX_IMAGES
    sampling = cfg.D2GO_DATA.TEST.SUBSET_SAMPLING
    source_splits = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST

    with contextlib.ExitStack() as stack:  # python 3.3+
        # Register one subsampled dataset per split; ExitStack unregisters all
        # of them when the caller leaves the context.
        subsampled_splits = tuple(
            stack.enter_context(
                register_sub_dataset_with_n_images(split, max_images, sampling)
            )
            for split in source_splits
        )
        new_cfg = cfg.clone()
        with temp_defrost(new_cfg):
            if is_train:
                new_cfg.DATASETS.TRAIN = subsampled_splits
            else:
                new_cfg.DATASETS.TEST = subsampled_splits
        yield new_cfg
def _setup_after_launch(cfg: CN, output_dir: str, runner):
    """
    Set things up after entering DDP, including
    - creating working directory
    - setting up logger
    - logging environment
    - initializing runner
    """
    create_dir_on_global_main_process(output_dir)
    comm.synchronize()
    setup_loggers(output_dir)
    cfg.freeze()

    # Keep cfg.OUTPUT_DIR in sync with the actual working directory.
    if output_dir != cfg.OUTPUT_DIR:
        with temp_defrost(cfg):
            logger.warning(
                "Override cfg.OUTPUT_DIR ({}) to be the same as output_dir {}".format(
                    cfg.OUTPUT_DIR, output_dir
                )
            )
            cfg.OUTPUT_DIR = output_dir

    dump_cfg(cfg, os.path.join(output_dir, "config.yaml"))
def do_train(cfg: CfgNode, trainer: pl.Trainer, task: GeneralizedRCNNTask) -> Dict[str, str]:
    """Runs the training loop with given trainer and task.

    Args:
        cfg: The normalized ConfigNode for this D2Go Task.
        trainer: PyTorch Lightning trainer.
        task: Lightning module instance.

    Returns:
        A map of model name to trained model config path.
    """
    with EventStorage() as storage:
        task.storage = storage
        trainer.fit(task)

    # Save a final checkpoint (used for validation monitoring) and record its
    # path in a cloned config so downstream evaluation can load it.
    final_ckpt = os.path.join(cfg.OUTPUT_DIR, FINAL_MODEL_CKPT)
    trainer.save_checkpoint(final_ckpt)  # for validation monitor

    trained_cfg = cfg.clone()
    with temp_defrost(trained_cfg):
        trained_cfg.MODEL.WEIGHTS = final_ckpt

    return dump_trained_model_configs(cfg.OUTPUT_DIR, {"model_final": trained_cfg})
def do_train(self, cfg, model, resume):
    """Run the detectron2-style training loop for ``model``.

    Builds optimizer/scheduler/checkpointer, optionally resumes from the last
    checkpoint, registers hooks (EMA, eval, QAT, k-means anchors, writers),
    and trains to cfg.SOLVER.MAX_ITER.

    Args:
        self: runner providing build_* helpers and hook factories.
        cfg: frozen training config.
        model: the model to train (possibly wrapped below).
        resume: whether to resume from the latest checkpoint.

    Returns:
        Dict mapping "model_final" to a config whose MODEL.WEIGHTS points at
        the final checkpoint.
    """
    add_print_flops_callback(cfg, model, disable_after_callback=True)

    optimizer = self.build_optimizer(cfg, model)
    scheduler = self.build_lr_scheduler(cfg, optimizer)

    checkpointer = self.build_checkpointer(
        cfg,
        model,
        save_dir=cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=scheduler,
    )
    checkpoint = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume)
    start_iter = (
        checkpoint.get("iteration", -1)
        if resume and checkpointer.has_checkpoint()
        else -1
    )
    # The checkpoint stores the training iteration that just finished, thus we start
    # at the next iteration (or iter zero if there's no checkpoint).
    start_iter += 1
    max_iter = cfg.SOLVER.MAX_ITER
    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    data_loader = self.build_detection_train_loader(cfg)

    def _get_model_with_abnormal_checker(model):
        # Optionally wrap the model so abnormal losses are detected and
        # reported via the configured writers.
        if not cfg.ABNORMAL_CHECKER.ENABLED:
            return model

        tbx_writer = _get_tbx_writer(
            get_tensorboard_log_dir(cfg.OUTPUT_DIR))
        writers = abnormal_checker.get_writers(cfg, tbx_writer)
        checker = abnormal_checker.AbnormalLossChecker(start_iter, writers)
        ret = abnormal_checker.AbnormalLossCheckerWrapper(model, checker)
        return ret

    trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(
        _get_model_with_abnormal_checker(model), data_loader, optimizer
    )
    trainer_hooks = [
        hooks.IterationTimer(),
        # NOTE(review): disabled features contribute None entries here —
        # presumably register_hooks filters them out; confirm.
        model_ema.EMAHook(cfg, model) if cfg.MODEL_EMA.ENABLED else None,
        self._create_after_step_hook(
            cfg, model, optimizer, scheduler, periodic_checkpointer
        ),
        hooks.EvalHook(
            cfg.TEST.EVAL_PERIOD,
            # trainer.iter is read lazily at eval time via the closure.
            lambda: self.do_test(cfg, model, train_iter=trainer.iter),
        ),
        kmeans_anchors.compute_kmeans_anchors_hook(self, cfg),
        self._create_qat_hook(cfg) if cfg.QUANTIZATION.QAT.ENABLED else None,
    ]
    if comm.is_main_process():
        # Metric writers only run on the main process.
        tbx_writer = _get_tbx_writer(
            get_tensorboard_log_dir(cfg.OUTPUT_DIR))
        writers = [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            tbx_writer,
        ]
        trainer_hooks.append(hooks.PeriodicWriter(writers))
    trainer.register_hooks(trainer_hooks)
    trainer.train(start_iter, max_iter)

    # If the runner stashed the pre-training config, return that one so any
    # training-time config mutations are ignored in the returned config.
    if hasattr(self, 'original_cfg'):
        table = get_cfg_diff_table(cfg, self.original_cfg)
        logger.info(
            "GeneralizeRCNN Runner ignoring training config change: \n" + table
        )
        trained_cfg = self.original_cfg.clone()
    else:
        trained_cfg = cfg.clone()
    with temp_defrost(trained_cfg):
        trained_cfg.MODEL.WEIGHTS = checkpointer.get_checkpoint_file()
    return {"model_final": trained_cfg}
def do_train(self, cfg, model, resume):
    """Run the detectron2-style training loop for ``model``.

    Variant of the runner training loop that attaches profilers, collects its
    trainer hooks via ``_get_trainer_hooks`` and the hook registry, then
    trains to cfg.SOLVER.MAX_ITER.

    Args:
        self: runner providing build_* helpers and hook factories.
        cfg: frozen training config.
        model: the model to train (possibly wrapped below).
        resume: whether to resume from the latest checkpoint.

    Returns:
        Dict mapping "model_final" to a config whose MODEL.WEIGHTS points at
        the final checkpoint.
    """
    # Note that flops at the beginning of training is often inaccurate,
    # if a model has input-dependent logic
    attach_profilers(cfg, model)

    optimizer = self.build_optimizer(cfg, model)
    scheduler = self.build_lr_scheduler(cfg, optimizer)

    checkpointer = self.build_checkpointer(
        cfg,
        model,
        save_dir=cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=scheduler,
    )
    checkpoint = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume)
    start_iter = (
        checkpoint.get("iteration", -1)
        if resume and checkpointer.has_checkpoint()
        else -1
    )
    # The checkpoint stores the training iteration that just finished, thus we start
    # at the next iteration (or iter zero if there's no checkpoint).
    start_iter += 1
    max_iter = cfg.SOLVER.MAX_ITER
    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    data_loader = self.build_detection_train_loader(cfg)

    def _get_model_with_abnormal_checker(model):
        # Optionally wrap the model so abnormal losses are detected and
        # reported via the configured writers.
        if not cfg.ABNORMAL_CHECKER.ENABLED:
            return model

        tbx_writer = self.get_tbx_writer(cfg)
        writers = abnormal_checker.get_writers(cfg, tbx_writer)
        checker = abnormal_checker.AbnormalLossChecker(start_iter, writers)
        ret = abnormal_checker.AbnormalLossCheckerWrapper(model, checker)
        return ret

    trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(
        _get_model_with_abnormal_checker(model), data_loader, optimizer
    )
    trainer_hooks = self._get_trainer_hooks(
        cfg, model, optimizer, scheduler, periodic_checkpointer, trainer
    )

    if comm.is_main_process():
        # Metric writers only run on the main process.
        tbx_writer = self.get_tbx_writer(cfg)
        writers = [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            tbx_writer,
        ]
        trainer_hooks.append(hooks.PeriodicWriter(writers))
    # Allow externally-registered hooks to amend the list before training.
    update_hooks_from_registry(trainer_hooks)
    trainer.register_hooks(trainer_hooks)
    trainer.train(start_iter, max_iter)

    # If the runner stashed the pre-training config, return that one so any
    # training-time config mutations are ignored in the returned config.
    if hasattr(self, "original_cfg"):
        table = get_cfg_diff_table(cfg, self.original_cfg)
        logger.info(
            "GeneralizeRCNN Runner ignoring training config change: \n" + table
        )
        trained_cfg = self.original_cfg.clone()
    else:
        trained_cfg = cfg.clone()
    with temp_defrost(trained_cfg):
        trained_cfg.MODEL.WEIGHTS = checkpointer.get_checkpoint_file()
    return {"model_final": trained_cfg}