def _write_metrics(self, metrics_dict: dict):
    """
    Args:
        metrics_dict (dict): dict of scalar metrics
    """
    metrics_dict = {
        k: v.detach().cpu().item() if isinstance(v, torch.Tensor) else float(v)
        for k, v in metrics_dict.items()
    }
    # gather metrics among all workers for logging
    # This assumes we do DDP-style training, which is currently the only
    # supported method in core.
    all_metrics_dict = comm.gather(metrics_dict)

    if comm.is_main_process():
        if "data_time" in all_metrics_dict[0]:
            # data_time among workers can have high variance. The actual latency
            # caused by data_time is the maximum among workers.
            data_time = np.max([x.pop("data_time") for x in all_metrics_dict])
            self.storage.put_scalar("data_time", data_time)

        # average the remaining metrics across workers
        metrics_dict = {
            k: np.mean([x[k] for x in all_metrics_dict])
            for k in all_metrics_dict[0].keys()
        }
        total_losses_reduced = sum(loss for loss in metrics_dict.values())

        self.storage.put_scalar("total_loss", total_losses_reduced)
        if len(metrics_dict) > 1:
            self.storage.put_scalars(**metrics_dict)
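# Minimal illustration of the reduction above, assuming two workers: per-worker
# scalar dicts are gathered, "data_time" takes the max across workers, the other
# keys are averaged, and the averaged values are summed into "total_loss".
# The loss names and values are illustrative only.
import numpy as np

all_metrics_dict = [
    {"loss_cls": 0.8, "loss_triplet": 0.3, "data_time": 0.02},  # worker 0
    {"loss_cls": 0.6, "loss_triplet": 0.5, "data_time": 0.10},  # worker 1
]
data_time = np.max([x.pop("data_time") for x in all_metrics_dict])  # 0.10
metrics = {k: np.mean([x[k] for x in all_metrics_dict]) for k in all_metrics_dict[0]}
total_loss = sum(metrics.values())  # 0.7 + 0.4 = 1.1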
def default_setup(cfg, args):
    """
    Perform some basic common setups at the beginning of a job, including:

    1. Set up the detectron2 logger
    2. Log basic information about environment, cmdline arguments, and config
    3. Backup the config to the output directory

    Args:
        cfg (CfgNode): the full config to be used
        args (argparse.NameSpace): the command line arguments to be logged
    """
    output_dir = cfg.OUTPUT_DIR
    if comm.is_main_process() and output_dir:
        PathManager.mkdirs(output_dir)

    rank = comm.get_rank()
    setup_logger(output_dir, distributed_rank=rank, name="fvcore")
    logger = setup_logger(output_dir, distributed_rank=rank)

    logger.info("Rank of current process: {}. World size: {}".format(
        rank, comm.get_world_size()))
    logger.info("Environment info:\n" + collect_env_info())

    logger.info("Command line arguments: " + str(args))
    if hasattr(args, "config_file") and args.config_file != "":
        logger.info("Contents of args.config_file={}:\n{}".format(
            args.config_file,
            PathManager.open(args.config_file, "r").read()))

    logger.info("Running with full config:\n{}".format(cfg))
    if comm.is_main_process() and output_dir:
        # Note: some of our scripts may expect the existence of
        # config.yaml in output directory
        path = os.path.join(output_dir, "config.yaml")
        with PathManager.open(path, "w") as f:
            f.write(cfg.dump())
        logger.info("Full config saved to {}".format(os.path.abspath(path)))

    # make sure each worker has a different, yet deterministic seed if specified
    seed_all_rng()

    # cudnn benchmark has large overhead. It shouldn't be used considering the small size of
    # typical validation set.
    if not (hasattr(args, "eval_only") and args.eval_only):
        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK
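# Hedged usage sketch: how default_setup is usually wired into a launch script.
# `get_cfg` and `default_argument_parser` are assumed helper names borrowed from
# the detectron2-style API this function follows and may differ in this codebase.
def setup(args):
    cfg = get_cfg()                        # start from the default config
    cfg.merge_from_file(args.config_file)  # YAML overrides
    cfg.merge_from_list(args.opts)         # command-line KEY VALUE overrides
    cfg.freeze()
    default_setup(cfg, args)               # loggers, config backup, cudnn flag
    return cfg


if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    cfg = setup(args)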
def train(self):
    """
    Run training.

    Returns:
        OrderedDict of results, if evaluation is enabled. Otherwise None.
    """
    super().train(self.start_iter, self.max_iter)
    if comm.is_main_process():
        assert hasattr(self, "_last_eval_results"), \
            "No evaluation results obtained during training!"
        # verify_results(self.cfg, self._last_eval_results)
        return self._last_eval_results
def test(cls, cfg, model, evaluators=None):
    """
    Args:
        cfg (CfgNode):
        model (nn.Module):
        evaluators (list[DatasetEvaluator] or None): if None, will call
            :meth:`build_evaluator`. Otherwise, must have the same length as
            `cfg.DATASETS.TESTS`.

    Returns:
        dict: a dict of result metrics
    """
    logger = logging.getLogger(__name__)
    if isinstance(evaluators, DatasetEvaluator):
        evaluators = [evaluators]
    if evaluators is not None:
        assert len(cfg.DATASETS.TESTS) == len(evaluators), "{} != {}".format(
            len(cfg.DATASETS.TESTS), len(evaluators))

    results = OrderedDict()
    for idx, dataset_name in enumerate(cfg.DATASETS.TESTS):
        logger.info("Prepare testing set")
        data_loader, num_query = cls.build_test_loader(cfg, dataset_name)
        # When evaluators are passed in as arguments,
        # implicitly assume that evaluators can be created before data_loader.
        if evaluators is not None:
            evaluator = evaluators[idx]
        else:
            try:
                evaluator = cls.build_evaluator(cfg, num_query)
            except NotImplementedError:
                logger.warning(
                    "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
                    "or implement its `build_evaluator` method.")
                results[dataset_name] = {}
                continue
        results_i = inference_on_dataset(model, data_loader, evaluator)
        results[dataset_name] = results_i

    if comm.is_main_process():
        assert isinstance(
            results, dict
        ), "Evaluator must return a dict on the main process. Got {} instead.".format(
            results)
        print_csv_format(results)

    if len(results) == 1:
        results = list(results.values())[0]

    return results
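# Hedged usage sketch: standalone evaluation with the classmethod above.
# `DefaultTrainer`, `build_model`, and `Checkpointer` follow names used elsewhere
# in this code; using `cfg.MODEL.WEIGHTS` as the checkpoint path is an assumption.
model = DefaultTrainer.build_model(cfg)
Checkpointer(model).load(cfg.MODEL.WEIGHTS)  # load trained weights
results = DefaultTrainer.test(cfg, model)    # one result dict per entry in cfg.DATASETS.TESTS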
def cache_url(url, model_dir=None, progress=True):
    r"""Download the file at the given URL into `model_dir` and return the path
    to the cached copy.

    If the file is already present in `model_dir`, the cached path is returned
    without re-downloading. The filename part of the URL should follow the
    naming convention ``filename-<sha256>.ext`` where ``<sha256>`` is the first
    eight or more digits of the SHA256 hash of the contents of the file. The
    hash is used to ensure unique names and to verify the contents of the file.

    The default value of `model_dir` is ``$TORCH_HOME/models`` where
    ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be
    overridden with the ``$TORCH_MODEL_ZOO`` environment variable.

    Args:
        url (string): URL of the object to download
        model_dir (string, optional): directory in which to save the object
        progress (bool, optional): whether or not to display a progress bar to stderr

    Example:
        >>> cached_file = core.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')
    """
    if model_dir is None:
        torch_home = os.path.expanduser(os.getenv('TORCH_HOME', '~/.torch'))
        model_dir = os.getenv('TORCH_MODEL_ZOO', os.path.join(torch_home, 'models'))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    parts = urlparse(url)
    if parts.fragment != "":
        filename = parts.fragment
    else:
        filename = os.path.basename(parts.path)
    if filename == "model_final.pkl":
        # workaround as pre-trained Caffe2 models from Detectron have all the same filename,
        # so make the full path the filename by replacing / with _
        filename = parts.path.replace("/", "_")
    cached_file = os.path.join(model_dir, filename)
    if not os.path.exists(cached_file) and is_main_process():
        sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file))
        hash_prefix = HASH_REGEX.search(filename)
        if hash_prefix is not None:
            hash_prefix = hash_prefix.group(1)
            # workaround: Caffe2 models don't have a hash, but follow the R-50 convention,
            # which matches the hash PyTorch uses. So we skip the hash matching
            # if the hash_prefix is less than 6 characters
            if len(hash_prefix) < 6:
                hash_prefix = None
        _download_url_to_file(url, cached_file, hash_prefix, progress=progress)
    synchronize()
    return cached_file
def __init__(self, cfg):
    """
    Args:
        cfg (CfgNode):
    """
    logger = logging.getLogger("core")
    if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for core
        setup_logger()

    # Assume these objects must be constructed in this order.
    data_loader = self.build_train_loader(cfg)
    cfg = self.auto_scale_hyperparams(cfg, data_loader)
    model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        # ref to https://github.com/pytorch/pytorch/issues/22049: set `find_unused_parameters=True`
        # if some of the parameters are not updated.
        model = DistributedDataParallel(
            model,
            device_ids=[comm.get_local_rank()],
            broadcast_buffers=False,
        )

    super().__init__(model, data_loader, optimizer)

    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks
    self.checkpointer = Checkpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        save_to_disk=comm.is_main_process(),
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.start_iter = 0
    if cfg.SOLVER.SWA.ENABLED:
        self.max_iter = cfg.SOLVER.MAX_ITER + cfg.SOLVER.SWA.ITER
    else:
        self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg

    self.register_hooks(self.build_hooks())
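# Hedged usage sketch: building the trainer and running the full loop. `setup(args)`
# refers to the config helper sketched after default_setup above; `resume_or_load`
# and `args.resume` are assumptions borrowed from the detectron2-style trainer API.
def main(args):
    cfg = setup(args)
    trainer = DefaultTrainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    return trainer.train()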
def build_reid_test_loader(cfg, dataset_name):
    test_transforms = build_transforms(cfg, is_train=False)

    dataset = DATASET_REGISTRY.get(dataset_name)(root=_root)
    if comm.is_main_process():
        dataset.show_test()
    test_items = dataset.query + dataset.gallery

    test_set = CommDataset(test_items, test_transforms, relabel=False)

    batch_size = cfg.TEST.IMS_PER_BATCH
    data_sampler = samplers.InferenceSampler(len(test_set))
    batch_sampler = torch.utils.data.BatchSampler(data_sampler, batch_size, False)
    test_loader = DataLoader(
        test_set,
        batch_sampler=batch_sampler,
        num_workers=4,  # save some memory
        collate_fn=fast_batch_collator,
    )
    return test_loader, len(dataset.query)
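# Hedged usage sketch (single-process): test_items concatenates query + gallery,
# so the first `num_query` samples yielded by the loader are the queries and the
# rest are gallery. The "images" batch key and the dataset name "Market1501" are
# assumptions and may differ in this codebase; `model` and `cfg` are assumed defined.
test_loader, num_query = build_reid_test_loader(cfg, "Market1501")
feats = []
with torch.no_grad():
    for batch in test_loader:
        feats.append(model(batch["images"].cuda()))
feats = torch.cat(feats)
query_feats, gallery_feats = feats[:num_query], feats[num_query:]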
def build_reid_train_loader(cfg):
    train_transforms = build_transforms(cfg, is_train=True)

    train_items = list()
    for d in cfg.DATASETS.NAMES:
        dataset = DATASET_REGISTRY.get(d)(root=_root, combineall=cfg.DATASETS.COMBINEALL)
        if comm.is_main_process():
            dataset.show_train()
        train_items.extend(dataset.train)

    train_set = CommDataset(train_items, train_transforms, relabel=True)

    num_workers = cfg.DATALOADER.NUM_WORKERS
    mini_batch_size = cfg.SOLVER.IMS_PER_BATCH
    num_instance = cfg.DATALOADER.NUM_INSTANCE
    global_batch_size = mini_batch_size * comm.get_world_size()

    if cfg.DATALOADER.PK_SAMPLER:
        if cfg.DATALOADER.NAIVE_WAY:
            data_sampler = samplers.NaiveIdentitySampler(
                train_set.img_items, global_batch_size, num_instance)
        else:
            data_sampler = samplers.BalancedIdentitySampler(
                train_set.img_items, global_batch_size, num_instance)
    else:
        data_sampler = samplers.TrainingSampler(len(train_set))

    batch_sampler = torch.utils.data.sampler.BatchSampler(
        data_sampler, mini_batch_size, True)

    train_loader = torch.utils.data.DataLoader(
        train_set,
        num_workers=num_workers,
        batch_sampler=batch_sampler,
        collate_fn=fast_batch_collator,
    )
    return train_loader
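# Hedged usage sketch: assuming the training/identity samplers above produce an
# infinite index stream (as the detectron2-style TrainingSampler does), training
# consumes the loader for a fixed number of iterations rather than by epoch.
train_loader = build_reid_train_loader(cfg)
data_iter = iter(train_loader)
for _ in range(cfg.SOLVER.MAX_ITER):
    batch = next(data_iter)  # forward/backward on `batch` goes here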
def test_and_save_results():
    if comm.is_main_process():
        self._last_eval_results = self.test(self.cfg, self.model)
        return self._last_eval_results
    else:
        return None
def build_hooks(self):
    """
    Build a list of default hooks, including timing, evaluation,
    checkpointing, lr scheduling, precise BN, writing events.

    Returns:
        list[HookBase]:
    """
    logger = logging.getLogger(__name__)
    cfg = self.cfg.clone()
    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 0  # save some memory and time for PreciseBN
    cfg.DATASETS.NAMES = tuple([cfg.TEST.PRECISE_BN.DATASET])  # set dataset name for PreciseBN

    ret = [
        hooks.IterationTimer(),
        hooks.LRScheduler(self.optimizer, self.scheduler),
    ]

    if cfg.SOLVER.SWA.ENABLED:
        ret.append(
            hooks.SWA(
                cfg.SOLVER.MAX_ITER,
                cfg.SOLVER.SWA.PERIOD,
                cfg.SOLVER.SWA.LR_FACTOR,
                cfg.SOLVER.SWA.ETA_MIN_LR,
                cfg.SOLVER.SWA.LR_SCHED,
            ))

    if cfg.TEST.PRECISE_BN.ENABLED and hooks.get_bn_modules(self.model):
        logger.info("Prepare precise BN dataset")
        ret.append(
            hooks.PreciseBN(
                # Run at the same freq as (but before) evaluation.
                self.model,
                # Build a new data loader to not affect training
                self.build_train_loader(cfg),
                cfg.TEST.PRECISE_BN.NUM_ITER,
            ))

    if cfg.MODEL.FREEZE_LAYERS != [''] and cfg.SOLVER.FREEZE_ITERS > 0:
        freeze_layers = ",".join(cfg.MODEL.FREEZE_LAYERS)
        logger.info(
            f'Freeze layer group "{freeze_layers}" training for {cfg.SOLVER.FREEZE_ITERS:d} iterations'
        )
        ret.append(
            hooks.FreezeLayer(
                self.model,
                self.optimizer,
                cfg.MODEL.FREEZE_LAYERS,
                cfg.SOLVER.FREEZE_ITERS,
            ))

    # Do PreciseBN before checkpointer, because it updates the model and needs to
    # be saved by checkpointer.
    # This is not always the best: if checkpointing has a different frequency,
    # some checkpoints may have more precise statistics than others.
    if comm.is_main_process():
        ret.append(
            hooks.PeriodicCheckpointer(self.checkpointer,
                                       cfg.SOLVER.CHECKPOINT_PERIOD))

    def test_and_save_results():
        if comm.is_main_process():
            self._last_eval_results = self.test(self.cfg, self.model)
            return self._last_eval_results
        else:
            return None

    # Do evaluation after checkpointer, because then if it fails,
    # we can use the saved checkpoint to debug.
    ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))

    if comm.is_main_process():
        # run writers in the end, so that evaluation metrics are written
        ret.append(hooks.PeriodicWriter(self.build_writers(), 200))

    return ret
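# Hedged sketch: subclasses typically extend the schedule by overriding build_hooks
# and appending to the default list; `MyCustomHook` is purely illustrative and not
# part of this codebase.
class MyTrainer(DefaultTrainer):
    def build_hooks(self):
        ret = super().build_hooks()
        ret.append(MyCustomHook())  # extra hook runs after the defaults
        return ret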