def __init__(self, ncols: int = 80) -> None:
    # The tqdm writer is created later, once the iterator to wrap is known.
    self.writer = None
    self._ncols = ncols
    # Scratch storage for values reported during an epoch.
    self._temporal_memory = {}
    # Route logging and print through tqdm so output does not break the bar.
    liblog._set_tqdm_handler()
    liblog._set_tqdm_print()
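# Usage sketch for the constructor above, under stated assumptions:
# `SupervisedTrainer`, `model`, `optimizer` and `loss_f` are hypothetical
# placeholders defined elsewhere in homura or user code, not in this file.
#
#   reporter = TQDMReporter(ncols=100)  # a wider bar than the default 80 columns
#   trainer = SupervisedTrainer(model, optimizer, loss_f, reporters=[reporter])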
def __init__(self, iterator: Iterable, verb: bool = False):
    super(TQDMReporter, self).__init__()
    # Wrap with tqdm only on the master process to avoid duplicated bars
    # during distributed training.
    self.writer = tqdm.tqdm(iterator, ncols=80) if is_master() else iterator
    self._verb = verb
    self._logger = liblog.get_logger(__name__)
    # Note: the iterator must be sized (e.g. `range`), since len() is taken here.
    self._length = len(iterator)
    liblog._set_tqdm_handler()
    liblog._set_tqdm_print()
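# Usage sketch for this older, iterator-wrapping variant (assumptions: the
# reporter itself is iterated to drive the epoch loop, and `trainer` and
# `train_loader` are hypothetical objects defined elsewhere):
#
#   epochs = TQDMReporter(range(100), verb=True)  # sized iterable, so len() works
#   for _ in epochs:
#       trainer.train(train_loader)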
def __init__(self,
             model: Union[nn.Module, Dict[str, nn.Module]],
             optimizer: Optional[Union[Partial, Optimizer, Dict[str, Optimizer]]],
             loss_f: Optional[Union[Callable, Dict[str, Callable]]] = None,
             *,
             reporters: Optional[Union[_ReporterBase, List[_ReporterBase]]] = None,
             scheduler: Optional[Union[Partial, Scheduler, Dict[str, Scheduler]]] = None,
             update_scheduler_by_epoch: bool = True,
             device: Optional[Union[torch.device, str]] = None,
             verb: bool = True,
             use_cudnn_benchmark: bool = True,
             use_cuda_nonblocking: bool = False,
             use_horovod: bool = False,
             logger=None,
             use_sync_bn: bool = False,
             tqdm_ncols: int = 80,
             **kwargs):

    if kwargs.get("callbacks"):
        raise DeprecationWarning(
            "`callbacks` is deprecated; if you need it, use homura before v2020.8"
        )

    self.logger = logger or get_logger(__name__)
    self.device = device or (torch.device(GPU) if torch.cuda.is_available()
                             else torch.device(CPU))

    # setup for distributed training
    self._use_sync_bn = use_sync_bn
    if is_distributed():
        if self._use_sync_bn:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
            self.logger.info("BNs of model are converted to nn.SyncBatchNorm")
        rank = get_local_rank()
        torch.cuda.set_device(rank)
        if get_global_rank() > 0:
            # to avoid overwriting the master's output
            verb = False

    # setup the model
    if isinstance(model, nn.Module):
        self.model = model
    elif isinstance(model, dict):
        self.model = nn.ModuleDict(model)
    else:
        raise TypeError(
            f"Unknown type for `model`. Expected nn.Module or Dict[str, Module], but got {type(model)}"
        )

    if GPU in str(self.device):
        self.model.to(self.device)
        torch.backends.cudnn.benchmark = use_cudnn_benchmark
        self._cuda_nonblocking = use_cuda_nonblocking
        self.logger.debug(f"cuda: True, cudnn.benchmark: {use_cudnn_benchmark}, "
                          f"cuda.nonblocking: {use_cuda_nonblocking}")
    else:
        self._cuda_nonblocking = False
        # usually, this is not expected
        self.logger.info(
            f"cuda: False (torch.cuda.is_available()={torch.cuda.is_available()})"
        )

    if not use_horovod and is_distributed():
        self.model = nn.parallel.DistributedDataParallel(self.model,
                                                         device_ids=[rank])

    # self.accessible_model is useful for e.g. checkpointing
    if isinstance(self.model, (nn.parallel.DistributedDataParallel, nn.DataParallel)):
        self.accessible_model = self.model.module
    else:
        self.accessible_model = self.model

    self.loss_f = loss_f
    self._verb = verb

    # setup the optimizer and the scheduler
    self.optimizer = optimizer
    self.scheduler = scheduler
    self._update_scheduler_by_epoch = update_scheduler_by_epoch
    self.set_optimizer()
    self.set_scheduler()

    if use_horovod:
        if not is_horovod_available():
            raise RuntimeError("horovod is not available!")
        import horovod.torch as hvd

        hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(self.optimizer, root_rank=0)
        self.optimizer = hvd.DistributedOptimizer(
            self.optimizer, named_parameters=self.model.named_parameters())

    if reporters is not None and not isinstance(reporters, Iterable):
        reporters = [reporters]
    reporters = reporters or []
    if not any(isinstance(rep, TQDMReporter) for rep in reporters):
        # ensure reporters always contain a TQDMReporter
        reporters.append(TQDMReporter(ncols=tqdm_ncols))
    self.reporter = ReporterList(reporters)

    # accessed via properties;
    # _step and _epoch start at -1 because they are incremented before each iteration and epoch
    self._step = -1
    self._epoch = -1
    self._is_train = True

    # to nest progress bars, leave=False
    # (https://github.com/tqdm/tqdm/blob/master/examples/simple_examples.py#L19)
    self._tqdm = lambda x: x
    if verb:
        self._tqdm = Partial(tqdm, ncols=tqdm_ncols, leave=False)
        _set_tqdm_print()

    # register any extra keyword arguments as attributes, moving tensors and
    # modules to the trainer's device
    for k, v in kwargs.items():
        if hasattr(self, k):
            raise AttributeError(f"{self} already has {k}")
        if torch.is_tensor(v):
            v = v.to(self.device)
        if isinstance(v, nn.Module):
            v.to(self.device)
        setattr(self, k, v)
        self.logger.debug(f"trainer sets {k} as a new attribute")
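# Sketch of the **kwargs hook above (assumptions: `SupervisedTrainer` is a
# subclass of this trainer and `ema_model` is a hypothetical extra module;
# tensors and nn.Modules passed this way are moved to the trainer's device):
#
#   trainer = SupervisedTrainer(model, optimizer, F.cross_entropy,
#                               ema_model=copy.deepcopy(model))
#   trainer.ema_model  # now available as an attribute, on trainer.device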
def __init__(self,
             model: Union[nn.Module, Dict[str, nn.Module]],
             optimizer: Optional[Union[Partial, Optimizer, Dict[str, Optimizer]]],
             loss_f: Optional[Union[Callable, Dict[str, Callable]]],
             *,
             callbacks: Optional[Iterable[Callback]] = None,
             scheduler: Optional[Union[Partial, Scheduler, Dict[str, Scheduler]]] = None,
             update_scheduler_by_epoch: bool = True,
             device: Optional[Union[torch.device, str]] = None,
             verb: bool = True,
             use_cudnn_benchmark: bool = True,
             use_cuda_nonblocking: bool = False,
             use_horovod: bool = False,
             logger=None,
             use_sync_bn: bool = False,
             tqdm_ncols: int = 80,
             **kwargs):

    if logger is None:
        logger = get_logger(__name__)
    self.logger = logger

    if device is None:
        self.device = torch.device(GPU) if torch.cuda.is_available() else torch.device(CPU)
    else:
        self.device = device

    if use_horovod and not is_horovod_available():
        raise RuntimeError("horovod is not available!")

    if is_distributed():
        if use_sync_bn:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        rank = get_local_rank()
        torch.cuda.set_device(rank)
        if get_global_rank() > 0:
            # to avoid overwriting the master's output
            verb = False

    if isinstance(model, nn.Module):
        self.model = model
    elif isinstance(model, dict):
        self.model = nn.ModuleDict(model)
    else:
        raise TypeError(
            f"Unknown type for `model`. Expected nn.Module or Dict[str, Module] but got {type(model)}"
        )

    if GPU in str(self.device):
        self.model.to(self.device)
        torch.backends.cudnn.benchmark = use_cudnn_benchmark
        self._cuda_nonblocking = use_cuda_nonblocking
        self.logger.debug(f"cuda: True, cudnn.benchmark: {use_cudnn_benchmark}, "
                          f"cuda.nonblocking: {use_cuda_nonblocking}")
    else:
        self._cuda_nonblocking = False
        # usually, this is not expected
        self.logger.info(
            f"cuda: False (torch.cuda.is_available()={torch.cuda.is_available()})"
        )

    if not use_horovod and is_distributed():
        self.model = nn.parallel.DistributedDataParallel(self.model,
                                                         device_ids=[rank])

    # self.accessible_model is useful for e.g. checkpointing
    if isinstance(self.model, (nn.parallel.DistributedDataParallel, nn.DataParallel)):
        self.accessible_model = self.model.module
    else:
        self.accessible_model = self.model

    self.optimizer = optimizer
    self.scheduler = scheduler
    self._callbacks = callbacks
    self.update_scheduler_by_epoch = update_scheduler_by_epoch
    self.set_optimizer()
    self.set_scheduler()
    self._set_callbacks(callbacks)

    if use_horovod:
        import horovod.torch as hvd

        hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(self.optimizer, root_rank=0)
        self.optimizer = hvd.DistributedOptimizer(
            self.optimizer, named_parameters=self.model.named_parameters())

    self.loss_f = loss_f
    self._verb = verb

    # accessed via properties;
    # _step and _epoch start at -1 because they are incremented before each iteration and epoch!
    self._step = -1
    self._epoch = -1
    self._is_train = True

    # to nest progress bars, leave=False
    # (https://github.com/tqdm/tqdm/blob/master/examples/simple_examples.py#L19)
    self._tqdm = lambda x: x
    if verb:
        self._tqdm = Partial(tqdm, ncols=tqdm_ncols, leave=False)
        _set_tqdm_print()

    # shared state handed to callbacks at iteration, epoch and run granularity
    _map_base = {
        MODEL: self.accessible_model,
        OPTIMIZER: self.optimizer,
        SCHEDULER: self.scheduler,
        TRAINER: self
    }
    self._iteration_map = TensorMap(**_map_base.copy())
    self._epoch_map = TensorMap(**_map_base.copy())
    self._all_map = TensorMap(**_map_base.copy())

    # register any extra keyword arguments as attributes, moving tensors and
    # modules to the trainer's device
    for k, v in kwargs.items():
        if hasattr(self, k):
            raise AttributeError(f"{self} already has {k}")
        if torch.is_tensor(v):
            v = v.to(self.device)
        if isinstance(v, nn.Module):
            v.to(self.device)
        setattr(self, k, v)

    self._callbacks.before_all(self._all_map)
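# Sketch of the legacy callbacks-based API above (assumptions: callback classes
# such as AccuracyCallback come from homura.callbacks of the same era, and
# `SupervisedTrainer` subclasses this trainer; names are illustrative only):
#
#   trainer = SupervisedTrainer(model, optimizer, F.cross_entropy,
#                               callbacks=[AccuracyCallback()])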