def __init__(self, cfg): """ Args: cfg (CfgNode): """ super().__init__() logger = logging.getLogger("herbarium") if not logger.isEnabledFor( logging.INFO): # setup_logger is not called for d2 setup_logger() cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size()) # Assume these objects must be constructed in this order. model = self.build_model(cfg) optimizer = self.build_optimizer(cfg, model) data_loader = self.build_train_loader(cfg) model = create_ddp_model(model, broadcast_buffers=False, find_unused_parameters=True) self._trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(model, data_loader, optimizer) self.scheduler = self.build_lr_scheduler(cfg, optimizer) self.checkpointer = Checkpointer( # Assume you want to save checkpoints together with logs/statistics model, cfg.OUTPUT_DIR, trainer=weakref.proxy(self), ) self.start_iter = 0 self.max_iter = cfg.SOLVER.MAX_ITER self.cfg = cfg self.register_hooks(self.build_hooks())
def build_general_test_loader(dataset, *, mapper, total_batch_size=1, sampler=None, num_workers=0): """ Similar to `build_detection_train_loader`, but uses a batch size of 1, and :class:`InferenceSampler`. This sampler coordinates all workers to produce the exact set of all samples. This interface is experimental. Args: dataset (list or torch.utils.data.Dataset): a list of dataset dicts, or a map-style pytorch dataset. They can be obtained by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. mapper (callable): a callable which takes a sample (dict) from dataset and returns the format to be consumed by the model. When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``. sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces indices to be applied on ``dataset``. Default to :class:`InferenceSampler`, which splits the dataset across all workers. num_workers (int): number of parallel data loading workers Returns: DataLoader: a torch DataLoader, that loads the given detection dataset, with test-time transformation and batching. Examples: :: data_loader = build_detection_test_loader( DatasetRegistry.get("my_test"), mapper=DatasetMapper(...)) # or, instantiate with a CfgNode: data_loader = build_detection_test_loader(cfg, "my_test") """ if isinstance(dataset, list): dataset = DatasetFromList(dataset, copy=False) if mapper is not None: dataset = MapDataset(dataset, mapper) if sampler is None: sampler = InferenceSampler(len(dataset)) world_size = get_world_size() assert ( total_batch_size > 0 and total_batch_size % world_size == 0 ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format( total_batch_size, world_size ) batch_size = total_batch_size // world_size # Always use 1 image per worker during inference since this is the # standard when reporting inference time in papers. batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, batch_size, drop_last=False) data_loader = torch.utils.data.DataLoader( dataset, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=trivial_batch_collator, ) return data_loader
def default_setup(cfg, args): """ Perform some basic common setups at the beginning of a job, including: 1. Set up the herbarium logger 2. Log basic information about environment, cmdline arguments, and config 3. Backup the config to the output directory Args: cfg (CfgNode or omegaconf.DictConfig): the full config to be used args (argparse.NameSpace): the command line arguments to be logged """ output_dir = _try_get_key(cfg, "OUTPUT_DIR", "output_dir", "train.output_dir") if comm.is_main_process() and output_dir: PathManager.mkdirs(output_dir) rank = comm.get_rank() setup_logger(output_dir, distributed_rank=rank, name="fvcore") logger = setup_logger(output_dir, distributed_rank=rank) logger.info("Rank of current process: {}. World size: {}".format( rank, comm.get_world_size())) logger.info("Environment info:\n" + collect_env_info()) logger.info("Command line arguments: " + str(args)) if hasattr(args, "config_file") and args.config_file != "": logger.info("Contents of args.config_file={}:\n{}".format( args.config_file, _highlight( PathManager.open(args.config_file, "r").read(), args.config_file), )) if comm.is_main_process() and output_dir: # Note: some of our scripts may expect the existence of # config.yaml in output directory path = os.path.join(output_dir, "config.yaml") if isinstance(cfg, CfgNode): logger.info("Running with full config:\n{}".format( _highlight(cfg.dump(), ".yaml"))) with PathManager.open(path, "w") as f: f.write(cfg.dump()) else: LazyConfig.save(cfg, path) logger.info("Full config saved to {}".format(path)) # make sure each worker has a different, yet deterministic seed if specified seed = _try_get_key(cfg, "SEED", "train.seed", default=-1) seed_all_rng(None if seed < 0 else seed + rank) # cudnn benchmark has large overhead. It shouldn't be used considering the small size of # typical validation set. if not (hasattr(args, "eval_only") and args.eval_only): torch.backends.cudnn.benchmark = _try_get_key(cfg, "CUDNN_BENCHMARK", "train.cudnn_benchmark", default=False)
def __init__(self, size: int): """ Args: size (int): the total number of data of the underlying dataset to sample from """ self._size = size assert size > 0 self._rank = comm.get_rank() self._world_size = comm.get_world_size() shard_size = (self._size - 1) // self._world_size + 1 begin = shard_size * self._rank end = min(shard_size * (self._rank + 1), self._size) self._local_indices = range(begin, end)
def build_batch_data_loader(dataset, sampler, total_batch_size, *, aspect_ratio_grouping=False, num_workers=0): """ Build a batched dataloader for training. Args: dataset (torch.utils.data.Dataset): map-style PyTorch dataset. Can be indexed. sampler (torch.utils.data.sampler.Sampler): a sampler that produces indices total_batch_size, aspect_ratio_grouping, num_workers): see :func:`build_detection_train_loader`. Returns: iterable[list]. Length of each list is the batch size of the current GPU. Each element in the list comes from the dataset. """ world_size = get_world_size() assert ( total_batch_size > 0 and total_batch_size % world_size == 0 ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format( total_batch_size, world_size) batch_size = total_batch_size // world_size if aspect_ratio_grouping: data_loader = torch.utils.data.DataLoader( dataset, sampler=sampler, num_workers=num_workers, batch_sampler=None, collate_fn=operator.itemgetter( 0), # don't batch, but yield individual elements worker_init_fn=worker_init_reset_seed, ) # yield individual mapped dict return AspectRatioGroupedDataset(data_loader, batch_size) else: batch_sampler = torch.utils.data.sampler.BatchSampler( sampler, batch_size, drop_last=True) # drop_last so the batch always have the same size return torch.utils.data.DataLoader( dataset, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=trivial_batch_collator, worker_init_fn=worker_init_reset_seed, )
def forward(self, input): if comm.get_world_size() == 1 or not self.training: return super().forward(input) B, C = input.shape[0], input.shape[1] mean = torch.mean(input, dim=[0, 2, 3]) meansqr = torch.mean(input * input, dim=[0, 2, 3]) if self._stats_mode == "": assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.' vec = torch.cat([mean, meansqr], dim=0) vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size()) mean, meansqr = torch.split(vec, C) momentum = self.momentum else: if B == 0: vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype) vec = vec + input.sum( ) # make sure there is gradient w.r.t input else: vec = torch.cat([ mean, meansqr, torch.ones([1], device=mean.device, dtype=mean.dtype) ], dim=0) vec = AllReduce.apply(vec * B) total_batch = vec[-1].detach() momentum = total_batch.clamp( max=1) * self.momentum # no update if total_batch is 0 total_batch = torch.max( total_batch, torch.ones_like(total_batch)) # avoid div-by-zero mean, meansqr, _ = torch.split(vec / total_batch, C) var = meansqr - mean * mean invstd = torch.rsqrt(var + self.eps) scale = self.weight * invstd bias = self.bias - mean * scale scale = scale.reshape(1, -1, 1, 1) bias = bias.reshape(1, -1, 1, 1) self.running_mean += momentum * (mean.detach() - self.running_mean) self.running_var += momentum * (var.detach() - self.running_var) return input * scale + bias
def __init__(self, cfg): """ Args: model, data_loader, optimizer: same as in :class:`SimpleTrainer`. grad_scaler: torch GradScaler to automatically scale gradients. """ super().__init__() logger = logging.getLogger("herbarium") if not logger.isEnabledFor( logging.INFO): # setup_logger is not called for d2 setup_logger() cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size()) # Assume these objects must be constructed in this order. model = build_model(cfg) optimizer = build_optimizer(cfg, model) controller = build_controller(cfg, model) train_data_loader = build_general_train_loader(cfg) # TODO: Need to change here for validation dataset loader val_data_loader = train_data_loader model = create_ddp_model(model, broadcast_buffers=False, find_unused_parameters=True) self._trainer = HierarchyTrainLoop( model, controller, train_data_loader, val_data_loader, optimizer, ) self.scheduler = build_lr_scheduler(cfg, optimizer) self.checkpointer = Checkpointer( # Assume you want to save checkpoints together with logs/statistics model, cfg.OUTPUT_DIR, trainer=weakref.proxy(self), ) self.start_iter = 0 self.max_iter = cfg.SOLVER.MAX_ITER self.cfg = cfg self.register_hooks(self.build_hooks())
def main(args): cfg = setup(args) model = build_model(cfg) logger.info("Model:\n{}".format(model)) if args.eval_only: DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( cfg.MODEL.WEIGHTS, resume=args.resume) return do_test(cfg, model) distributed = comm.get_world_size() > 1 if distributed: model = DistributedDataParallel(model, device_ids=[comm.get_local_rank()], broadcast_buffers=False) do_train(cfg, model, resume=args.resume) return do_test(cfg, model)
def create_ddp_model(model, *, fp16_compression=False, **kwargs): """ Create a DistributedDataParallel model if there are >1 processes. Args: model: a torch.nn.Module fp16_compression: add fp16 compression hooks to the ddp object. See more at https://pytorch.org/docs/stable/ddp_comm_hooks.html#torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_hook kwargs: other arguments of :module:`torch.nn.parallel.DistributedDataParallel`. """ # noqa if comm.get_world_size() == 1: return model if "device_ids" not in kwargs: kwargs["device_ids"] = [comm.get_local_rank()] ddp = DistributedDataParallel(model, **kwargs) if fp16_compression: from torch.distributed.algorithms.ddp_comm_hooks import default as comm_hooks ddp.register_comm_hook(state=None, hook=comm_hooks.fp16_compress_hook) return ddp
def __init__(self, repeat_factors, *, shuffle=True, seed=None): """ Args: repeat_factors (Tensor): a float vector, the repeat factor for each indice. When it's full of ones, it is equivalent to ``TrainingSampler(len(repeat_factors), ...)``. shuffle (bool): whether to shuffle the indices or not seed (int): the initial seed of the shuffle. Must be the same across all workers. If None, will use a random seed shared among workers (require synchronization among all workers). """ self._shuffle = shuffle if seed is None: seed = comm.shared_random_seed() self._seed = int(seed) self._rank = comm.get_rank() self._world_size = comm.get_world_size() # Split into whole number (_int_part) and fractional (_frac_part) parts. self._int_part = torch.trunc(repeat_factors) self._frac_part = repeat_factors - self._int_part
def __init__(self, size: int, shuffle: bool = True, seed: Optional[int] = None): """ Args: size (int): the total number of data of the underlying dataset to sample from shuffle (bool): whether to shuffle the indices or not seed (int): the initial seed of the shuffle. Must be the same across all workers. If None, will use a random seed shared among workers (require synchronization among all workers). """ self._size = size assert size > 0 self._shuffle = shuffle if seed is None: seed = comm.shared_random_seed() self._seed = int(seed) self._rank = comm.get_rank() self._world_size = comm.get_world_size()
def inference_on_dataset(model, data_loader, evaluator): """ Run model on the data_loader and evaluate the metrics with evaluator. Also benchmark the inference speed of `model.__call__` accurately. The model will be used in eval mode. Args: model (callable): a callable which takes an object from `data_loader` and returns some outputs. If it's an nn.Module, it will be temporarily set to `eval` mode. If you wish to evaluate a model in `training` mode instead, you can wrap the given model and override its behavior of `.eval()` and `.train()`. data_loader: an iterable object with a length. The elements it generates will be the inputs to the model. evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want to benchmark, but don't want to do any evaluation. Returns: The return value of `evaluator.evaluate()` """ num_devices = get_world_size() logger = logging.getLogger(__name__) logger.info("Start inference on {} images".format(len(data_loader))) total = len(data_loader) # inference data loader must have a fixed length if evaluator is None: # create a no-op evaluator evaluator = DatasetEvaluators([]) evaluator.reset() num_warmup = min(5, total - 1) start_time = time.perf_counter() total_compute_time = 0 with ExitStack() as stack: if isinstance(model, nn.Module): stack.enter_context(inference_context(model)) stack.enter_context(torch.no_grad()) for idx, inputs in enumerate(data_loader): if idx == num_warmup: start_time = time.perf_counter() total_compute_time = 0 start_compute_time = time.perf_counter() outputs = model(inputs) if torch.cuda.is_available(): torch.cuda.synchronize() total_compute_time += time.perf_counter() - start_compute_time evaluator.process(inputs, outputs) iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup) seconds_per_img = total_compute_time / iters_after_start if idx >= num_warmup * 2 or seconds_per_img > 5: total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1))) log_every_n_seconds( logging.INFO, "Inference done {}/{}. {:.4f} s / img. ETA={}".format( idx + 1, total, seconds_per_img, str(eta) ), n=5, ) # Measure the time only for this worker (before the synchronization barrier) total_time = time.perf_counter() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) # NOTE this format is parsed by grep logger.info( "Total inference time: {} ({:.6f} s / img per device, on {} devices)".format( total_time_str, total_time / (total - num_warmup), num_devices ) ) total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time))) logger.info( "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)".format( total_compute_time_str, total_compute_time / (total - num_warmup), num_devices ) ) results = evaluator.evaluate() # An evaluator may return None when not in main process. # Replace it by an empty dict instead to make it easier for downstream code to handle if results is None: results = {} return results