def __init__(self, logger=None, save_dir=None, save_log: bool = False):
    super(LoggerWrapper, self).__init__(save_dir)
    from homura.liblog import get_logger, set_file_handler

    self.logger = get_logger("homura.reporter") if logger is None else logger
    if save_log:
        set_file_handler(self._save_dir / "log.txt")
def __init__(self, iterator: Iterable, verb: bool = False):
    super(TQDMReporter, self).__init__()
    self.writer = tqdm.tqdm(iterator, ncols=80) if is_master() else iterator
    self._verb = verb
    self._logger = liblog.get_logger(__name__)
    self._length = len(iterator)
    liblog._set_tqdm_handler()
def __init__(self, iterator: Iterable, verb: bool = False):
    super(TQDMReporter, self).__init__()
    self.writer = tqdm.tqdm(iterator, dynamic_ncols=True) if is_master() else iterator
    self._verb = verb
    self._logger = liblog.get_logger(__name__)
    self._length = len(iterator)
    self._max_accuracy = -1.0
    liblog._set_tqdm_handler()
    liblog._set_tqdm_print()
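# Minimal construction sketch for the reporter above (illustrative, not part of
# the snippet): range(10) stands in for the epoch iterator. On the master
# process `writer` becomes a tqdm progress bar over the iterator; on other
# ranks it stays the raw iterator, so output is not duplicated across workers.
reporter = TQDMReporter(range(10), verb=True)
assert reporter._length == 10  # the iterator length is cached at construction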
def __init__(self, metric: Callable[[Mapping], Any], name: str, logger=None, no_reduce: bool = False):
    if metric is not None:
        self.metric_function = metric
    self.metric_name = name
    self._last_iter = {}
    self._last_epoch = {}
    self._metrics_history = {}
    self._logger = get_logger(__name__) if logger is None else logger
    self._warning_flag = True
    self._no_reduce = no_reduce
def __init__(self, metric: Callable[[Mapping], Any], name: str, logger=None,
             reduction="average", no_reduce: bool = False):
    if metric is not None:
        self.metric_function = metric
    self.metric_name = name
    self._last_iter = {}
    self._last_epoch = {}
    self._metrics_history = {}
    self._logger = get_logger(__name__) if logger is None else logger
    self._no_reduce = no_reduce
    if reduction not in ("average", "sum"):
        raise RuntimeError(f"`reduction` should be 'average' or 'sum', but got {reduction} instead")
    self.reduction = reduction
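# Hedged sketch of a metric function matching the Callable[[Mapping], Any]
# signature accepted above. The "output"/"target" keys and the MetricCallback
# name are assumptions for illustration only; homura passes its own state
# mapping at each iteration and reduces the values per epoch ("average" or "sum").
def accuracy_fn(data: Mapping) -> float:
    output, target = data["output"], data["target"]
    return (output.argmax(dim=-1) == target).float().mean().item()

# e.g. metric_cb = MetricCallback(accuracy_fn, name="accuracy", reduction="average")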
def __init__(self,
             model: nn.Module or Dict[str, nn.Module],
             optimizer: Optional[Optimizer or Dict[str, Optimizer] or torch.optim.Optimizer],
             loss_f: Optional[Callable or Dict[str, Callable]],
             *,
             callbacks: Optional[Callback or Iterable[Callable]] = None,
             scheduler: Optional[LRScheduler or Dict[str, LRScheduler]] = None,
             update_scheduler_by_epoch: bool = True,
             device: Optional[torch.device or str] = None,
             verb=True,
             use_cudnn_benchmark=True,
             use_cuda_nonblocking=False,
             logger=None,
             **kwargs):
    if logger is None:
        logger = get_logger(__name__)
    super(TrainerBase, self).__init__(model, callbacks, device, use_cudnn_benchmark,
                                      use_cuda_nonblocking, logger, **kwargs)

    # set optimizer(s)
    if optimizer is None:
        self.optimizer = None
    elif isinstance(optimizer, Optimizer):
        self.optimizer = optimizer.set_model(self.model.parameters())
    elif isinstance(optimizer, torch.optim.Optimizer):
        self.optimizer = optimizer
    elif isinstance(optimizer, dict):
        if not isinstance(model, dict):
            raise TypeError("model is not dict but optimizer is dict!")
        self.optimizer = StepDict(torch.optim.Optimizer)
        # self.model is nn.ModuleDict, then self.optimizer is StepDict
        for k, opt in optimizer.items():
            m = self.model._modules.get(k)
            if m is None:
                raise KeyError(f"No such key {k} in model!")
            if opt is None:
                self.optimizer[k] = None
            elif isinstance(opt, Optimizer):
                self.optimizer[k] = opt.set_model(m.parameters())
            else:
                raise TypeError(f"Unknown type: {type(opt)}")
    else:
        raise TypeError(f"Unknown type: {type(optimizer)}")
    self.logger.debug(f"Use optimizer: {self.optimizer.__class__.__name__}")

    # set scheduler(s)
    self.update_scheduler_by_epoch = update_scheduler_by_epoch
    self.update_scheduler(scheduler, update_scheduler_by_epoch)
    self.logger.debug(f"Use scheduler: {self.scheduler.__class__.__name__}")

    self.loss_f = loss_f
    self._verb = verb

    # called via property
    # _step and _epoch are set to -1 because they are incremented before each iteration and epoch!
    self._step = -1
    self._epoch = -1
    self._is_train = True

    _map_base = {MODEL: self.model,
                 OPTIMIZER: self.optimizer,
                 SCHEDULER: self.scheduler,
                 TRAINER: self}
    self._iteration_map = Map(**_map_base.copy())
    self._epoch_map = Map(**_map_base.copy())
    self._all_map = Map(**_map_base.copy())

    self._callbacks.before_all(self._all_map)
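# Hedged sketch of the Dict[str, Module] / Dict[str, Optimizer] pairing handled
# above (homura-style optimizer wrappers exposing .set_model are assumed; the
# keys "generator"/"discriminator" are illustrative only). Every key of the
# optimizer dict must also be a key of the model dict, otherwise KeyError is raised.
model = {"generator": nn.Linear(16, 16), "discriminator": nn.Linear(16, 1)}
optimizer = {"generator": optim.SGD(lr=0.1), "discriminator": optim.SGD(lr=0.1)}
# The trainer wraps `model` in nn.ModuleDict and pairs each optimizer with the
# parameters of the module sharing its key, e.g.
# optimizer["generator"].set_model(model["generator"].parameters()).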
from __future__ import annotations

import torch
from torch import Tensor

from homura.liblog import get_logger

logger = get_logger(__name__)

__all__ = ["true_positive", "true_negative", "false_positive", "false_negative",
           "classwise_accuracy", "precision", "recall", "specificity", "f1_score",
           "confusion_matrix", "accuracy"]


def _base(input: Tensor, target: Tensor) -> tuple[Tensor, Tensor, Tensor]:
    classes = torch.arange(input.size(1), device=input.device)
    pred = input.argmax(dim=1).view(-1, 1)
    target = target.view(-1, 1)
    return pred, target, classes


def true_positive(input: Tensor, target: Tensor) -> Tensor:
    """Calculate true positive

    :param input: output of network, expected to be `BxCx(OPTIONAL DIMENSIONS)`
    :param target: target, expected to be `Bx(OPTIONAL DIMENSIONS)`
    :return: true positive in float tensor of `C`
    """
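# The body of true_positive is cut off above; this is a minimal sketch of how a
# per-class count follows from `_base` (illustrative, not necessarily the
# library's exact implementation): pred and target have shape (B, 1) and
# classes has shape (C,), so the comparisons broadcast to (B, C) and summing
# over dim 0 yields one count per class.
def _true_positive_sketch(input: Tensor, target: Tensor) -> Tensor:
    pred, target, classes = _base(input, target)
    return ((pred == target) & (pred == classes)).sum(dim=0).float()

# e.g. for a 3-class problem with batch size 4:
#   _true_positive_sketch(torch.randn(4, 3), torch.tensor([0, 1, 2, 1]))
#   -> float tensor of shape (3,)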
def __init__(self,
             model: nn.Module or Dict[str, nn.Module],
             optimizer: Optional[Partial or Optimizer or Dict[str, Optimizer]],
             loss_f: Optional[Callable or Dict[str, Callable]],
             *,
             callbacks: Optional[Iterable[Callback]] = None,
             scheduler: Optional[Partial or Scheduler or Dict[str, Scheduler]] = None,
             update_scheduler_by_epoch: bool = True,
             device: Optional[torch.device or str] = None,
             verb: bool = True,
             use_cudnn_benchmark: bool = True,
             use_cuda_nonblocking: bool = False,
             use_horovod: bool = False,
             logger=None,
             use_sync_bn: bool = False,
             tqdm_ncols: int = 80,
             **kwargs):
    if logger is None:
        logger = get_logger(__name__)
    self.logger = logger

    if device is None:
        self.device = torch.device(GPU) if torch.cuda.is_available() else torch.device(CPU)
    else:
        self.device = device

    if use_horovod and not is_horovod_available():
        raise RuntimeError('horovod is not available!')

    if is_distributed():
        if use_sync_bn:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        rank = get_local_rank()
        torch.cuda.set_device(rank)
        if get_global_rank() > 0:
            # to avoid overwriting
            verb = False

    if isinstance(model, nn.Module):
        self.model = model
    elif isinstance(model, dict):
        self.model = nn.ModuleDict(model)
    else:
        raise TypeError(f"Unknown type for `model`. Expected nn.Module or Dict[str, Module] but got {type(model)}")

    if GPU in str(self.device):
        self.model.to(self.device)
        torch.backends.cudnn.benchmark = use_cudnn_benchmark
        self._cuda_nonblocking = use_cuda_nonblocking
        self.logger.debug(f"cuda: True, cudnn.benchmark: {use_cudnn_benchmark}, "
                          f"cuda.nonblocking: {use_cuda_nonblocking}")
    else:
        self._cuda_nonblocking = False
        # usually, this is not expected
        self.logger.info(f"cuda: False (torch.cuda.is_available()={torch.cuda.is_available()})")

    if not use_horovod and is_distributed():
        self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[rank])

    if isinstance(self.model, nn.parallel.DistributedDataParallel) or isinstance(self.model, nn.DataParallel):
        self.accessible_model = self.model.module
    else:
        self.accessible_model = self.model

    self.optimizer = None
    self.scheduler = None
    self._callbacks = None
    self.update_scheduler_by_epoch = update_scheduler_by_epoch
    self._set_optimizer(optimizer)
    self._set_scheduler(scheduler)
    self._set_callbacks(callbacks)

    if use_horovod:
        import horovod.torch as hvd

        hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(self.optimizer, root_rank=0)
        self.optimizer = hvd.DistributedOptimizer(self.optimizer,
                                                  named_parameters=self.model.named_parameters())

    self.loss_f = loss_f
    self._verb = verb

    # called via property
    # _step and _epoch are set to -1 because they are incremented before each iteration and epoch!
    self._step = -1
    self._epoch = -1
    self._is_train = True

    # to nest, leave=False (https://github.com/tqdm/tqdm/blob/master/examples/simple_examples.py#L19)
    self._tqdm = Partial(tqdm, ncols=tqdm_ncols, leave=False) if verb else lambda x: x

    _map_base = {MODEL: self.accessible_model,
                 OPTIMIZER: self.optimizer,
                 SCHEDULER: self.scheduler,
                 TRAINER: self}
    self._iteration_map = Map(**_map_base.copy())
    self._epoch_map = Map(**_map_base.copy())
    self._all_map = Map(**_map_base.copy())

    for k, v in kwargs.items():
        if hasattr(self, k):
            raise AttributeError(f"{self} already has {k}")
        if torch.is_tensor(v):
            v = v.to(self.device)
        if isinstance(v, nn.Module):
            v.to(self.device)
        setattr(self, k, v)

    self._callbacks.before_all(self._all_map)
""" Helper functions to get information about the environment. """ import importlib.util import os as python_os import subprocess import sys as python_sys from typing import Any, Optional from homura.liblog import get_logger logger = get_logger("homura.environment") # Utility functions that useful libraries are available or not def is_accimage_available() -> bool: return importlib.util.find_spec("accimage") is not None def enable_accimage() -> None: if is_accimage_available(): import torchvision torchvision.set_image_backend("accimage") logger.info("accimage is activated") else: logger.warning("accimage is not available") def is_faiss_available() -> bool: _faiss_available = importlib.util.find_spec("faiss") is not None
def __init__(self,
             model: nn.Module or Dict[str, nn.Module],
             optimizer: Optional[Partial or Optimizer or Dict[str, Optimizer]],
             loss_f: Optional[Callable or Dict[str, Callable]] = None,
             *,
             reporters: Optional[_ReporterBase or List[_ReporterBase]] = None,
             scheduler: Optional[Partial or Scheduler or Dict[str, Scheduler]] = None,
             update_scheduler_by_epoch: bool = True,
             device: Optional[torch.device or str] = None,
             verb: bool = True,
             use_cudnn_benchmark: bool = True,
             use_cuda_nonblocking: bool = False,
             use_horovod: bool = False,
             logger=None,
             use_sync_bn: bool = False,
             tqdm_ncols: int = 80,
             **kwargs):
    if kwargs.get("callbacks"):
        raise DeprecationWarning("callback is deprecated, if you need, use homura before v2020.8")

    self.logger = logger or get_logger(__name__)
    self.device = device or (torch.device(GPU) if torch.cuda.is_available() else torch.device(CPU))

    # setup for distributed
    self._use_sync_bn = use_sync_bn
    if is_distributed():
        if self._use_sync_bn:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
            self.logger.info("BNs of model are converted to nn.SyncBatchNorm")
        rank = get_local_rank()
        torch.cuda.set_device(rank)
        if get_global_rank() > 0:
            # to avoid overwriting
            verb = False

    # setup model
    if isinstance(model, nn.Module):
        self.model = model
    elif isinstance(model, dict):
        self.model = nn.ModuleDict(model)
    else:
        raise TypeError(f"Unknown type for `model`. Expected nn.Module or Dict[str, Module], but got {type(model)}")

    if GPU in str(self.device):
        self.model.to(self.device)
        torch.backends.cudnn.benchmark = use_cudnn_benchmark
        self._cuda_nonblocking = use_cuda_nonblocking
        self.logger.debug(f"cuda: True, cudnn.benchmark: {use_cudnn_benchmark}, "
                          f"cuda.nonblocking: {use_cuda_nonblocking}")
    else:
        self._cuda_nonblocking = False
        # usually, this is not expected
        self.logger.info(f"cuda: False (torch.cuda.is_available()={torch.cuda.is_available()})")

    if not use_horovod and is_distributed():
        self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[rank])

    # self.accessible_model is useful for e.g., checkpointing
    if isinstance(self.model, nn.parallel.DistributedDataParallel) or isinstance(self.model, nn.DataParallel):
        self.accessible_model = self.model.module
    else:
        self.accessible_model = self.model

    self.loss_f = loss_f
    self._verb = verb

    # setup optimizer and scheduler
    self.optimizer = optimizer
    self.scheduler = scheduler
    self._update_scheduler_by_epoch = update_scheduler_by_epoch
    self.set_optimizer()
    self.set_scheduler()

    if use_horovod:
        if not is_horovod_available():
            raise RuntimeError("horovod is not available!")
        import horovod.torch as hvd

        hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(self.optimizer, root_rank=0)
        self.optimizer = hvd.DistributedOptimizer(self.optimizer,
                                                  named_parameters=self.model.named_parameters())

    if reporters is not None and not isinstance(reporters, Iterable):
        reporters = [reporters]
    reporters = reporters or []
    if not any([isinstance(rep, TQDMReporter) for rep in reporters]):
        # if reporters do not contain a TQDMReporter
        reporters.append(TQDMReporter(ncols=tqdm_ncols))
    self.reporter = ReporterList(reporters)

    # called via property
    # _step and _epoch are set to -1 because they are incremented before each iteration and epoch
    self._step = -1
    self._epoch = -1
    self._is_train = True

    # to nest, leave=False (https://github.com/tqdm/tqdm/blob/master/examples/simple_examples.py#L19)
    self._tqdm = lambda x: x
    if verb:
        self._tqdm = Partial(tqdm, ncols=tqdm_ncols, leave=False)
        _set_tqdm_print()

    for k, v in kwargs.items():
        if hasattr(self, k):
            raise AttributeError(f"{self} already has {k}")
        if torch.is_tensor(v):
            v = v.to(self.device)
        if isinstance(v, nn.Module):
            v.to(self.device)
        setattr(self, k, v)
        self.logger.debug(f"trainer sets {k} as a new attribute")
""" Helper functions to make distributed training easy """ import builtins import os as python_os import warnings from functools import wraps from typing import Callable, Optional from torch import distributed from torch.cuda import device_count from homura.liblog import get_logger from .environment import get_args, get_environ logger = get_logger("homura.distributed") original_print = builtins.print def is_horovod_available() -> bool: warnings.warn("horovod is no longer supported by homura", DeprecationWarning) return False def is_distributed_available() -> bool: return distributed.is_available() def is_distributed() -> bool: """ Check if the process is distributed by checking the world size is larger than 1. """
from functools import partial
from itertools import cycle
from statistics import median
from typing import Tuple, Mapping, Callable, Optional

import torch
from torch import nn
from torch.nn import functional as F

from homura import optim, trainers, reporters, callbacks, Map, get_args, lr_scheduler
from homura.liblog import get_logger
from homura.modules import exponential_moving_average_, to_onehot

from backends.data import get_dataloader
from backends.wrn import wrn28_2

logger = get_logger(__file__)


class PackedLoader(object):

    def __init__(self, trusted_loader, untrusted_loader):
        self.l_loaders = trusted_loader
        self.u_loaders = untrusted_loader
        self._size = len(untrusted_loader)

    def __len__(self):
        return self._size

    def __iter__(self):
        for l, u in zip(cycle(self.l_loaders), self.u_loaders):
            yield list(l) + list(u)
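# Tiny illustration of PackedLoader's behavior, with plain lists standing in
# for DataLoaders (each inner list is a "batch"): the trusted loader is cycled
# so one of its batches is paired with every untrusted batch, and the packed
# loader's length equals the untrusted loader's length.
trusted = [[1], [2]]
untrusted = [[10], [20], [30]]
packed = PackedLoader(trusted, untrusted)
assert len(packed) == 3
assert list(packed) == [[1, 10], [2, 20], [1, 30]]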
def __init__(self,
             model: nn.Module or Dict[str, nn.Module],
             optimizer: Optional[Partial or Optimizer or Dict[str, Optimizer]],
             loss_f: Optional[Callable or Dict[str, Callable]] = None,
             *,
             reporters: Optional[_ReporterBase or List[_ReporterBase]] = None,
             scheduler: Optional[Partial or Scheduler or Dict[str, Scheduler]] = None,
             device: Optional[torch.device or str] = None,
             quiet: bool = True,
             disable_cudnn_benchmark: bool = False,
             disable_cuda_nonblocking: bool = False,
             logger=None,
             use_sync_bn: bool = False,
             tqdm_ncols: int = 120,
             debug: bool = False,
             **kwargs):
    if kwargs.get("update_scheduler_by_epoch"):
        raise DeprecationWarning("update_scheduler_by_epoch is deprecated, users need to step the scheduler manually")
    if kwargs.get("callbacks"):
        raise DeprecationWarning("callback is deprecated, if you need, use homura before v2020.8")

    self.logger = logger or get_logger(__name__)
    self.device = device or (torch.device(GPU) if torch.cuda.is_available() else torch.device(CPU))
    self._is_debug = debug
    if self._is_debug:
        self.logger.warning("Trainer is set to be debug mode, which may affect the performance")
        set_verb_level("debug")

    # setup for distributed
    self._use_sync_bn = use_sync_bn
    if is_distributed():
        if self._use_sync_bn:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
            self.logger.info("BNs of model are converted to nn.SyncBatchNorm")
        rank = get_local_rank()
        torch.cuda.set_device(rank)
        if get_global_rank() > 0:
            # to avoid overwriting
            quiet = True

    self.loss_f = loss_f
    self._verbose = not quiet

    # setup model
    if isinstance(model, nn.Module):
        self.model = model
    elif isinstance(model, dict):
        self.model = nn.ModuleDict(model)
        self.logger.debug(f"model is nn.ModuleDict of {self.model.keys()}")
    else:
        raise TypeError(f"Unknown type for `model`. Expected nn.Module or Dict[str, Module], but got {type(model)}")

    if GPU in str(self.device):
        self.model.to(self.device)
        torch.backends.cudnn.benchmark = not disable_cudnn_benchmark
        self._cuda_nonblocking = not disable_cuda_nonblocking
        self.logger.debug(f"cuda: True, cudnn.benchmark: {not disable_cudnn_benchmark}, "
                          f"cuda.nonblocking: {not disable_cuda_nonblocking}")
    else:
        self._cuda_nonblocking = False
        # usually, this is not expected
        self.logger.info(f"cuda: False (torch.cuda.is_available()={torch.cuda.is_available()})")

    if is_distributed():
        self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[rank])
        self.logger.debug(f"model converted to DistributedDataParallel at rank={rank}")

    # self.accessible_model is useful for e.g., checkpointing
    if isinstance(self.model, nn.parallel.DistributedDataParallel) or isinstance(self.model, nn.DataParallel):
        self.accessible_model = self.model.module
    else:
        self.accessible_model = self.model

    # setup optimizer and scheduler
    self.optimizer = optimizer
    self.scheduler = scheduler
    self.set_optimizer()
    self.set_scheduler()

    if reporters is not None and not isinstance(reporters, Iterable):
        reporters = [reporters]
    reporters = reporters or []
    if not any([isinstance(rep, TQDMReporter) for rep in reporters]):
        # if reporters do not contain a TQDMReporter
        reporters.append(TQDMReporter(ncols=tqdm_ncols))
    self.logger.debug(f"reporter is ready: {reporters}")
    self.reporter = ReporterList(reporters)

    # called via property
    # _step and _epoch are set to -1 because they are incremented before each iteration and epoch
    self._step = -1
    self._epoch = -1
    self._is_train = True

    # to nest, leave=False (https://github.com/tqdm/tqdm/blob/master/examples/simple_examples.py#L19)
    self._tqdm = lambda x: x
    if self._verbose:
        self._tqdm = Partial(tqdm, ncols=tqdm_ncols, leave=False)
        set_tqdm_stdout_stderr()
        self.logger.debug("verbose: setup tqdm")
    else:
        self.logger.debug("quiet: no tqdm")

    for k, v in kwargs.items():
        if hasattr(self, k):
            raise AttributeError(f"{self} already has {k}")
        if isinstance(v, torch.Tensor):
            v = v.to(self.device)
        if isinstance(v, nn.Module):
            v.to(self.device)
        setattr(self, k, v)
        self.logger.debug(f"trainer sets {k} as a new attribute")
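# Behavior sketch for the **kwargs handling above (illustrative; `ema_model`
# and `temperature` are hypothetical names, not part of homura's API): extra
# keyword arguments become trainer attributes, tensors and modules are moved
# to the trainer's device first, and a name that already exists on the trainer
# raises AttributeError.
#   trainer = SupervisedTrainer(model, optimizer, loss_f,
#                               ema_model=ema_net, temperature=torch.tensor(2.0))
#   trainer.ema_model      # ema_net, moved to trainer.device
#   trainer.temperature    # tensor(2.0), moved to trainer.device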
import importlib.util
import os as python_os
import subprocess
import sys as python_sys

from torch.cuda import device_count

from homura.liblog import get_logger

__all__ = ["is_accimage_available", "is_apex_available", "is_distributed", "enable_accimage",
           "get_global_rank", "get_local_rank", "get_world_size", "get_num_nodes"]

logger = get_logger("homura.env")

is_accimage_available = importlib.util.find_spec("accimage") is not None
is_apex_available = importlib.util.find_spec("apex") is not None

args = " ".join(python_sys.argv)
is_distributed = "--local_rank" in args


def _decode_bytes(b: bytes) -> str:
    return b.decode("ascii")[:-1]


def get_git_hash() -> str:
    try:
        is_git_repo = subprocess.run(["git", "rev-parse", "--is-inside-work-tree"],
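# Illustration of the is_distributed flag above (assumes the classic
# torch.distributed.launch launcher, which passes "--local_rank <n>" to each
# worker process's argv):
#   python -m torch.distributed.launch --nproc_per_node=2 train.py
#   -> " ".join(sys.argv) contains "--local_rank", so is_distributed is True
#   python train.py
#   -> no "--local_rank" in argv, so is_distributed is False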
def __init__(self,
             model: nn.Module or Dict[str, nn.Module],
             callbacks: Optional[Callback or Iterable[Callable]] = None,
             device: torch.device or str = None,
             use_cudnn_benchmark=True,
             use_cuda_nonblocking=False,
             logger: Optional[Logger] = None,
             **kwargs):
    self.logger = get_logger(__name__) if logger is None else logger

    if device is None:
        self.device = GPU if torch.cuda.is_available() else CPU
    else:
        self.device = device

    # set model(s)
    if isinstance(model, nn.Module):
        self.model = model
        self._is_single_model = True
    elif isinstance(model, dict):
        self.model = nn.ModuleDict(model)
        self._is_single_model = False
    else:
        raise TypeError(f"Unknown type for arg. model. Expected nn.Module or "
                        f"Dict[str, Module] but got {type(model)}")

    if GPU in str(self.device):
        if use_cudnn_benchmark:
            torch.backends.cudnn.benchmark = True
        self.model.to(self.device)
        self._cuda_nonblocking = use_cuda_nonblocking
        self.logger.debug(f"cuda: True, cudnn.benchmark: {use_cudnn_benchmark}, nonblocking: {use_cuda_nonblocking}")
    else:
        self._cuda_nonblocking = False
        self.logger.info("Running on CPU!")

    # set callback(s)
    if isinstance(callbacks, CallbackList):
        self._callbacks = callbacks
    elif isinstance(callbacks, Callback):
        self._callbacks = callbacks
        self.logger.debug(f"registered callback {callbacks.__class__.__name__}")
    elif isinstance(callbacks, Iterable):
        self._callbacks = CallbackList(*callbacks)
    elif callbacks is None:
        # if callback is not set
        self._callbacks = Callback()
        self.logger.debug("No callback registered")
    else:
        raise TypeError(f"type(callbacks) should not be {type(callbacks)}!")

    # set kwargs
    for k, v in kwargs.items():
        if hasattr(self, k):
            raise AttributeError(f"{self} already has {k}")
        if torch.is_tensor(v):
            # Tensor.to is not in-place, so rebind to actually move it to the device
            v = v.to(self.device)
        setattr(self, k, v)