def _get_sampler(train_set, test_set, val_set, train_sampler, test_sampler,
                 val_sampler, start_epoch):
    if train_sampler is None:
        if is_distributed():
            train_sampler = DistributedSampler(train_set,
                                               num_replicas=get_world_size(),
                                               rank=get_global_rank())
            train_sampler.set_epoch(start_epoch)
        else:
            train_sampler = RandomSampler(train_set, True)
    else:
        train_sampler = train_sampler(train_set)

    if test_sampler is None:
        if is_distributed():
            test_sampler = DistributedSampler(test_set,
                                              num_replicas=get_world_size(),
                                              rank=get_global_rank())
    else:
        test_sampler = test_sampler(test_set)

    if val_set is not None:
        if val_sampler is None and is_distributed():
            val_sampler = DistributedSampler(val_set,
                                             num_replicas=get_world_size(),
                                             rank=get_global_rank())
            val_sampler.set_epoch(start_epoch)
        elif val_sampler is not None:
            val_sampler = val_sampler(val_set)

    return train_sampler, test_sampler, val_sampler
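# Usage sketch (assumption, not from the source): the returned samplers plug
# straight into DataLoaders; `_example_build_loaders` and its defaults are
# hypothetical.
def _example_build_loaders(train_set, test_set, batch_size: int = 32):
    train_sampler, test_sampler, _ = _get_sampler(
        train_set, test_set, val_set=None,
        train_sampler=None, test_sampler=None, val_sampler=None,
        start_epoch=0)
    # In a single-process run, train_sampler samples with replacement via
    # RandomSampler and test_sampler stays None (sequential iteration).
    train_loader = DataLoader(train_set, batch_size, sampler=train_sampler)
    test_loader = DataLoader(test_set, batch_size, sampler=test_sampler)
    return train_loader, test_loader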
def main(cfg: Config):
    if cfg.enable_accimage:
        enable_accimage()

    model = resnet50()
    optimizer = optim.SGD(lr=1e-1 * cfg.batch_size * get_num_nodes() / 256,
                          momentum=0.9,
                          weight_decay=1e-4)
    scheduler = lr_scheduler.MultiStepLR([30, 60, 80])
    train_loader, test_loader = DATASET_REGISTRY(
        "fast_imagenet" if cfg.use_fast_collate else "imagenet")(
            cfg.batch_size,
            train_size=cfg.batch_size * 50 if cfg.debug else None,
            test_size=cfg.batch_size * 50 if cfg.debug else None,
            num_workers=cfg.num_workers)

    use_multi_gpus = not is_distributed() and torch.cuda.device_count() > 1
    with SupervisedTrainer(model,
                           optimizer,
                           F.cross_entropy,
                           reporters=[reporters.TensorboardReporter(".")],
                           scheduler=scheduler,
                           data_parallel=use_multi_gpus,
                           use_amp=cfg.use_amp,
                           use_cuda_nonblocking=True,
                           use_sync_bn=cfg.use_sync_bn,
                           report_accuracy_topk=5) as trainer:
        for epoch in trainer.epoch_range(cfg.epochs):
            trainer.train(train_loader)
            trainer.test(test_loader)

        print(f"Max Test Accuracy={max(trainer.reporter.history('accuracy/test')):.3f}")
def main():
    if is_distributed():
        init_distributed()

    model = se_resnet50(num_classes=1000)
    optimizer = optim.SGD(lr=0.6 / 1024 * args.batch_size,
                          momentum=0.9,
                          weight_decay=1e-4)
    scheduler = lr_scheduler.MultiStepLR([50, 70])
    train_loader, test_loader = DATASET_REGISTRY("imagenet")(args.batch_size)

    c = [
        callbacks.AccuracyCallback(),
        callbacks.AccuracyCallback(k=5),
        callbacks.LossCallback(),
        callbacks.WeightSave("."),
        reporters.TensorboardReporter("."),
        reporters.TQDMReporter(range(args.epochs)),
    ]

    with SupervisedTrainer(model,
                           optimizer,
                           F.cross_entropy,
                           callbacks=c,
                           scheduler=scheduler) as trainer:
        # iterating over the TQDMReporter (the last callback) drives the epochs
        for _ in c[-1]:
            trainer.train(train_loader)
            trainer.test(test_loader)
def set_batch_size(self, batch_size: int) -> None:
    if is_distributed():
        # sum the per-rank batch sizes over all processes
        _batch_size = torch.empty(1,
                                  dtype=torch.int,
                                  device=torch.device(torch.cuda.current_device())
                                  ).fill_(batch_size)
        distributed.all_reduce(_batch_size, op=distributed.ReduceOp.SUM)
        batch_size = _batch_size.item()
    self._total_size += batch_size
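# Worked example (assumption, not from the source): in a 4-process run where
# each rank sees a local batch of 32, the all_reduce sums the per-rank batch
# sizes, so self._total_size grows by 4 * 32 == 128 per call; in a
# single-process run it simply grows by 32.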
def __init__(self, key: str, reduction: str or Callable, no_sync: bool) -> None:
    self._key = key
    if isinstance(reduction, str) and reduction not in {'sum', 'average'}:
        raise ValueError(
            f"reduction is expected to be 'sum' or 'average', but got {reduction}.")
    self._reduction = reduction
    self._sync = not no_sync and is_distributed()
    self._total_size: int = 0
    self._memory: List[Any] = []
def distributed_ready_main(func: Callable = None,
                           backend: Optional[str] = None,
                           init_method: Optional[str] = None,
                           disable_distributed_print: bool = False) -> Callable:
    """ Wrap a main function to make it distributed ready
    """

    if is_distributed():
        init_distributed(backend=backend,
                         init_method=init_method,
                         disable_distributed_print=disable_distributed_print)

    @wraps(func)
    def inner(*args, **kwargs):
        return func(*args, **kwargs)

    return inner
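# Usage sketch (assumption, not from the source): `example_main` is a
# hypothetical entry point. Because the wrapper calls init_distributed at
# decoration time when the process is distributed, the body can assume the
# process group is ready (e.g., when launched via torchrun).
@distributed_ready_main
def example_main():
    print(f"distributed: {is_distributed()}")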
def __init__(self,
             model: nn.Module or Dict[str, nn.Module],
             optimizer: Optional[Partial or Optimizer or Dict[str, Optimizer]],
             loss_f: Optional[Callable or Dict[str, Callable]],
             *,
             callbacks: Optional[Iterable[Callback]] = None,
             scheduler: Optional[Partial or Scheduler or Dict[str, Scheduler]] = None,
             update_scheduler_by_epoch: bool = True,
             device: Optional[torch.device or str] = None,
             verb: bool = True,
             use_cudnn_benchmark: bool = True,
             use_cuda_nonblocking: bool = False,
             use_horovod: bool = False,
             logger=None,
             use_sync_bn: bool = False,
             tqdm_ncols: int = 80,
             **kwargs):

    if logger is None:
        logger = get_logger(__name__)
    self.logger = logger

    if device is None:
        self.device = torch.device(GPU) if torch.cuda.is_available() else torch.device(CPU)
    else:
        self.device = device

    if use_horovod and not is_horovod_available():
        raise RuntimeError('horovod is not available!')

    if is_distributed():
        if use_sync_bn:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        rank = get_local_rank()
        torch.cuda.set_device(rank)
        if get_global_rank() > 0:
            # to avoid overwriting
            verb = False

    if isinstance(model, nn.Module):
        self.model = model
    elif isinstance(model, dict):
        self.model = nn.ModuleDict(model)
    else:
        raise TypeError(
            f"Unknown type for `model`. Expected nn.Module or Dict[str, Module] but got {type(model)}")

    if GPU in str(self.device):
        self.model.to(self.device)
        torch.backends.cudnn.benchmark = use_cudnn_benchmark
        self._cuda_nonblocking = use_cuda_nonblocking
        self.logger.debug(f"cuda: True, cudnn.benchmark: {use_cudnn_benchmark}, "
                          f"cuda.nonblocking: {use_cuda_nonblocking}")
    else:
        self._cuda_nonblocking = False
        # usually, this is not expected
        self.logger.info(
            f"cuda: False (torch.cuda.is_available()={torch.cuda.is_available()})")

    if not use_horovod and is_distributed():
        self.model = nn.parallel.DistributedDataParallel(self.model,
                                                         device_ids=[rank])

    if isinstance(self.model, nn.parallel.DistributedDataParallel) or isinstance(self.model, nn.DataParallel):
        self.accessible_model = self.model.module
    else:
        self.accessible_model = self.model

    self.optimizer = None
    self.scheduler = None
    self._callbacks = None
    self.update_scheduler_by_epoch = update_scheduler_by_epoch
    self._set_optimizer(optimizer)
    self._set_scheduler(scheduler)
    self._set_callbacks(callbacks)

    if use_horovod:
        import horovod.torch as hvd

        hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(self.optimizer, root_rank=0)
        self.optimizer = hvd.DistributedOptimizer(
            self.optimizer, named_parameters=self.model.named_parameters())

    self.loss_f = loss_f
    self._verb = verb

    # called via property
    # _step and _epoch are set to -1 because they are incremented before each iteration and epoch!
    self._step = -1
    self._epoch = -1
    self._is_train = True

    # to nest, leave=False (https://github.com/tqdm/tqdm/blob/master/examples/simple_examples.py#L19)
    self._tqdm = Partial(tqdm, ncols=tqdm_ncols, leave=False) if verb else lambda x: x

    _map_base = {
        MODEL: self.accessible_model,
        OPTIMIZER: self.optimizer,
        SCHEDULER: self.scheduler,
        TRAINER: self
    }
    self._iteration_map = Map(**_map_base.copy())
    self._epoch_map = Map(**_map_base.copy())
    self._all_map = Map(**_map_base.copy())

    for k, v in kwargs.items():
        if hasattr(self, k):
            raise AttributeError(f"{self} already has {k}")
        if torch.is_tensor(v):
            v = v.to(self.device)
        if isinstance(v, nn.Module):
            v.to(self.device)
        setattr(self, k, v)

    self._callbacks.before_all(self._all_map)
def __init__(self,
             model: nn.Module or Dict[str, nn.Module],
             optimizer: Optional[Partial or Optimizer or Dict[str, Optimizer]],
             loss_f: Optional[Callable or Dict[str, Callable]] = None,
             *,
             reporters: Optional[_ReporterBase or List[_ReporterBase]] = None,
             scheduler: Optional[Partial or Scheduler or Dict[str, Scheduler]] = None,
             update_scheduler_by_epoch: bool = True,
             device: Optional[torch.device or str] = None,
             verb: bool = True,
             use_cudnn_benchmark: bool = True,
             use_cuda_nonblocking: bool = False,
             use_horovod: bool = False,
             logger=None,
             use_sync_bn: bool = False,
             tqdm_ncols: int = 80,
             **kwargs):

    if kwargs.get("callbacks"):
        raise DeprecationWarning(
            "callback is deprecated; if you need it, use homura before v2020.8")

    self.logger = logger or get_logger(__name__)
    self.device = device or (torch.device(GPU) if torch.cuda.is_available() else torch.device(CPU))

    # setup for distributed
    self._use_sync_bn = use_sync_bn
    if is_distributed():
        if self._use_sync_bn:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
            self.logger.info("BNs of model are converted to nn.SyncBatchNorm")
        rank = get_local_rank()
        torch.cuda.set_device(rank)
        if get_global_rank() > 0:
            # to avoid overwriting
            verb = False

    # setup model
    if isinstance(model, nn.Module):
        self.model = model
    elif isinstance(model, dict):
        self.model = nn.ModuleDict(model)
    else:
        raise TypeError(
            f"Unknown type for `model`. Expected nn.Module or Dict[str, Module], but got {type(model)}")

    if GPU in str(self.device):
        self.model.to(self.device)
        torch.backends.cudnn.benchmark = use_cudnn_benchmark
        self._cuda_nonblocking = use_cuda_nonblocking
        self.logger.debug(f"cuda: True, cudnn.benchmark: {use_cudnn_benchmark}, "
                          f"cuda.nonblocking: {use_cuda_nonblocking}")
    else:
        self._cuda_nonblocking = False
        # usually, this is not expected
        self.logger.info(
            f"cuda: False (torch.cuda.is_available()={torch.cuda.is_available()})")

    if not use_horovod and is_distributed():
        self.model = nn.parallel.DistributedDataParallel(self.model,
                                                         device_ids=[rank])

    # self.accessible_model is useful for e.g., checkpointing
    if isinstance(self.model, nn.parallel.DistributedDataParallel) or isinstance(self.model, nn.DataParallel):
        self.accessible_model = self.model.module
    else:
        self.accessible_model = self.model

    self.loss_f = loss_f
    self._verb = verb

    # setup optimizer and scheduler
    self.optimizer = optimizer
    self.scheduler = scheduler
    self._update_scheduler_by_epoch = update_scheduler_by_epoch
    self.set_optimizer()
    self.set_scheduler()

    if use_horovod:
        if not is_horovod_available():
            raise RuntimeError("horovod is not available!")
        import horovod.torch as hvd

        hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(self.optimizer, root_rank=0)
        self.optimizer = hvd.DistributedOptimizer(
            self.optimizer, named_parameters=self.model.named_parameters())

    if reporters is not None and not isinstance(reporters, Iterable):
        reporters = [reporters]
    reporters = reporters or []
    if not any([isinstance(rep, TQDMReporter) for rep in reporters]):
        # if reporters do not contain a TQDMReporter
        reporters.append(TQDMReporter(ncols=tqdm_ncols))
    self.reporter = ReporterList(reporters)

    # called via property
    # _step and _epoch are set to -1 because they are incremented before each iteration and epoch
    self._step = -1
    self._epoch = -1
    self._is_train = True

    # to nest, leave=False (https://github.com/tqdm/tqdm/blob/master/examples/simple_examples.py#L19)
    self._tqdm = lambda x: x
    if verb:
        self._tqdm = Partial(tqdm, ncols=tqdm_ncols, leave=False)
        _set_tqdm_print()

    for k, v in kwargs.items():
        if hasattr(self, k):
            raise AttributeError(f"{self} already has {k}")
        if torch.is_tensor(v):
            v = v.to(self.device)
        if isinstance(v, nn.Module):
            v.to(self.device)
        setattr(self, k, v)
        self.logger.debug(f"trainer sets {k} as a new attribute")
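# Usage sketch (assumption, not from the source): extra keyword arguments
# become trainer attributes, with tensors moved to the trainer's device;
# `ema_weight` is a hypothetical name.
def _example_extra_kwargs(model, train_loader):
    trainer = SupervisedTrainer(model,
                                optim.SGD(lr=0.1, momentum=0.9),
                                F.cross_entropy,
                                ema_weight=torch.tensor(0.999))
    trainer.train(train_loader)
    return trainer.ema_weight  # already on trainer.device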
def __init__(self,
             model: nn.Module or Dict[str, nn.Module],
             optimizer: Optional[Partial or Optimizer or Dict[str, Optimizer]],
             loss_f: Optional[Callable or Dict[str, Callable]] = None,
             *,
             reporters: Optional[_ReporterBase or List[_ReporterBase]] = None,
             scheduler: Optional[Partial or Scheduler or Dict[str, Scheduler]] = None,
             device: Optional[torch.device or str] = None,
             quiet: bool = True,
             disable_cudnn_benchmark: bool = False,
             disable_cuda_nonblocking: bool = False,
             logger=None,
             use_sync_bn: bool = False,
             tqdm_ncols: int = 120,
             debug: bool = False,
             **kwargs):

    if kwargs.get("update_scheduler_by_epoch"):
        raise DeprecationWarning(
            "update_scheduler_by_epoch is deprecated, users need to step the scheduler themselves")
    if kwargs.get("callbacks"):
        raise DeprecationWarning(
            "callback is deprecated; if you need it, use homura before v2020.8")

    self.logger = logger or get_logger(__name__)
    self.device = device or (torch.device(GPU) if torch.cuda.is_available() else torch.device(CPU))
    self._is_debug = debug
    if self._is_debug:
        self.logger.warning(
            "Trainer is set to debug mode, which may affect the performance")
        set_verb_level("debug")

    # setup for distributed
    self._use_sync_bn = use_sync_bn
    if is_distributed():
        if self._use_sync_bn:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
            self.logger.info("BNs of model are converted to nn.SyncBatchNorm")
        rank = get_local_rank()
        torch.cuda.set_device(rank)
        if get_global_rank() > 0:
            # to avoid overwriting
            quiet = True

    self.loss_f = loss_f
    self._verbose = not quiet

    # setup model
    if isinstance(model, nn.Module):
        self.model = model
    elif isinstance(model, dict):
        self.model = nn.ModuleDict(model)
        self.logger.debug(f"model is nn.ModuleDict of {self.model.keys()}")
    else:
        raise TypeError(
            f"Unknown type for `model`. Expected nn.Module or Dict[str, Module], but got {type(model)}")

    if GPU in str(self.device):
        self.model.to(self.device)
        torch.backends.cudnn.benchmark = not disable_cudnn_benchmark
        self._cuda_nonblocking = not disable_cuda_nonblocking
        self.logger.debug(f"cuda: True, cudnn.benchmark: {not disable_cudnn_benchmark}, "
                          f"cuda.nonblocking: {not disable_cuda_nonblocking}")
    else:
        self._cuda_nonblocking = False
        # usually, this is not expected
        self.logger.info(
            f"cuda: False (torch.cuda.is_available()={torch.cuda.is_available()})")

    if is_distributed():
        self.model = nn.parallel.DistributedDataParallel(self.model,
                                                         device_ids=[rank])
        self.logger.debug(f"model converted to DistributedDataParallel at rank={rank}")

    # self.accessible_model is useful for e.g., checkpointing
    if isinstance(self.model, nn.parallel.DistributedDataParallel) or isinstance(self.model, nn.DataParallel):
        self.accessible_model = self.model.module
    else:
        self.accessible_model = self.model

    # setup optimizer and scheduler
    self.optimizer = optimizer
    self.scheduler = scheduler
    self.set_optimizer()
    self.set_scheduler()

    if reporters is not None and not isinstance(reporters, Iterable):
        reporters = [reporters]
    reporters = reporters or []
    if not any([isinstance(rep, TQDMReporter) for rep in reporters]):
        # if reporters do not contain a TQDMReporter
        reporters.append(TQDMReporter(ncols=tqdm_ncols))
    self.logger.debug(f"reporter is ready: {reporters}")
    self.reporter = ReporterList(reporters)

    # called via property
    # _step and _epoch are set to -1 because they are incremented before each iteration and epoch
    self._step = -1
    self._epoch = -1
    self._is_train = True

    # to nest, leave=False (https://github.com/tqdm/tqdm/blob/master/examples/simple_examples.py#L19)
    self._tqdm = lambda x: x
    if self._verbose:
        self._tqdm = Partial(tqdm, ncols=tqdm_ncols, leave=False)
        set_tqdm_stdout_stderr()
        self.logger.debug("verbose: setup tqdm")
    else:
        self.logger.debug("quiet: no tqdm")

    for k, v in kwargs.items():
        if hasattr(self, k):
            raise AttributeError(f"{self} already has {k}")
        if isinstance(v, torch.Tensor):
            v = v.to(self.device)
        if isinstance(v, nn.Module):
            v.to(self.device)
        setattr(self, k, v)
        self.logger.debug(f"trainer sets {k} as a new attribute")
def get_dataloader(self,
                   batch_size: int,
                   train_da: Optional[List] = None,
                   test_da: Optional[List] = None,
                   norm: Optional[List] = None,
                   train_size: Optional[int] = None,
                   test_size: Optional[int] = None,
                   val_size: Optional[int] = None,
                   download: bool = False,
                   num_workers: int = 1,
                   non_training_bs_factor=2,
                   drop_last: bool = False,
                   pin_memory: bool = True,
                   return_num_classes: bool = False,
                   test_batch_size: Optional[int] = None,
                   pre_default_train_da: Optional[List] = None,
                   post_default_train_da: Optional[List] = None,
                   post_norm_train_da: Optional[List] = None,
                   use_prefetcher: bool = False,
                   start_epoch: int = 0
                   ) -> (Tuple[DataLoader, DataLoader]
                         or Tuple[DataLoader, DataLoader, DataLoader]
                         or Tuple[DataLoader, DataLoader, int]
                         or Tuple[DataLoader, DataLoader, DataLoader, int]):
    """ Get DataLoaders. This automatically handles the distributed setting.

    :param batch_size: Batch size
    :param train_da: Data augmentation for training
    :param test_da: Data augmentation for testing and validation
    :param norm: Normalization applied after train_da and test_da
    :param train_size: Size of the training dataset. If None, the full dataset is used.
    :param test_size: Size of the test dataset. If None, the full dataset is used.
    :param val_size: Size of the validation dataset, randomly split from the training dataset. If None, no validation loader is returned.
    :param download: Whether the dataset needs downloading
    :param num_workers: Number of workers in the data loaders
    :param non_training_bs_factor: Batch-size scale factor outside training. For example, testing requires no backward cache, so the batch size can basically be doubled.
    :param drop_last: Whether to drop the last (incomplete) batch
    :param pin_memory: Whether to pin memory
    :param return_num_classes: Whether to return the number of classes as the last return value
    :param test_batch_size: Test-time batch size. If None, non_training_bs_factor * batch_size is used.
    :param pre_default_train_da: Data augmentation before the default data augmentation
    :param post_default_train_da: Data augmentation after the default data augmentation
    :param post_norm_train_da: Data augmentation after normalization (i.e., norm)
    :param use_prefetcher: Whether to use the prefetcher
    :param start_epoch: Epoch at start time
    :return: train_loader, test_loader, [val_loader], [num_classes]
    """

    train_set, test_set, val_set = self.get_dataset(
        train_size,
        test_size,
        val_size,
        train_da,
        test_da,
        norm,
        download,
        pre_default_train_da=pre_default_train_da,
        post_default_train_da=post_default_train_da,
        post_norm_train_da=post_norm_train_da)

    if test_batch_size is None:
        test_batch_size = non_training_bs_factor * batch_size

    samplers = [None, None, None]
    if is_distributed():
        import homura

        dist_sampler_kwargs = dict(num_replicas=homura.get_world_size(),
                                   rank=homura.get_global_rank())
        samplers[0] = DistributedSampler(train_set, **dist_sampler_kwargs)
        samplers[2] = DistributedSampler(test_set, **dist_sampler_kwargs)
        samplers[0].set_epoch(start_epoch)
        samplers[2].set_epoch(start_epoch)
    else:
        samplers[0] = RandomSampler(train_set, True)

    shared_kwargs = dict(drop_last=drop_last,
                         num_workers=num_workers,
                         pin_memory=pin_memory,
                         collate_fn=self.collate_fn)
    train_loader = DataLoader(train_set, batch_size, sampler=samplers[0], **shared_kwargs)
    test_loader = DataLoader(test_set, test_batch_size, sampler=samplers[2], **shared_kwargs)
    if use_prefetcher:
        train_loader = DataPrefetchWrapper(train_loader, start_epoch)
        test_loader = DataPrefetchWrapper(test_loader, start_epoch)

    ret = [train_loader, test_loader]
    if val_set is not None:
        if is_distributed():
            samplers[1] = DistributedSampler(val_set, **dist_sampler_kwargs)
            samplers[1].set_epoch(start_epoch)
        val_loader = DataLoader(val_set, test_batch_size, sampler=samplers[1], **shared_kwargs)
        if use_prefetcher:
            val_loader = DataPrefetchWrapper(val_loader, start_epoch)
        ret.append(val_loader)

    if return_num_classes:
        ret.append(self.num_classes)

    return tuple(ret)
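# Usage sketch (assumption, not from the source): mirroring the "imagenet"
# examples above, calling a registered dataset is assumed to forward to
# get_dataloader; "cifar10" and the argument values are illustrative.
def _example_get_dataloader():
    train_loader, test_loader, num_classes = DATASET_REGISTRY("cifar10")(
        128, download=True, num_workers=4, return_num_classes=True)
    return train_loader, test_loader, num_classes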
def _process_tensor(self, value: Any) -> Any:
    if torch.is_tensor(value):
        if is_distributed() and not self._no_sync:
            distributed.all_reduce(value, op=distributed.ReduceOp.SUM)
        value = value.cpu()
    return value
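# Worked example (assumption, not from the source): with a world size of 4 and
# each rank holding torch.tensor(0.5), all_reduce with ReduceOp.SUM leaves
# torch.tensor(2.0) on every rank before the move to CPU. For the 'average'
# reduction, this summed value would later be divided by the all-reduced
# total size accumulated by set_batch_size.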