Example #1
    def __init__(self, logger=None, save_dir=None, save_log: bool = False):
        super(LoggerWrapper, self).__init__(save_dir)
        from homura.liblog import get_logger, set_file_handler

        self.logger = get_logger(
            "homura.reporter") if logger is None else logger
        if save_log:
            set_file_handler(self._save_dir / "log.txt")
Example #2
    def __init__(self, iterator: Iterable, verb: bool = False):

        super(TQDMReporter, self).__init__()
        self.writer = tqdm.tqdm(iterator,
                                ncols=80) if is_master() else iterator
        self._verb = verb
        self._logger = liblog.get_logger(__name__)
        self._length = len(iterator)
        liblog._set_tqdm_handler()
Example #3
    def __init__(self, iterator: Iterable, verb: bool = False):

        super(TQDMReporter, self).__init__()
        self.writer = tqdm.tqdm(
            iterator, dynamic_ncols=True) if is_master() else iterator
        self._verb = verb
        self._logger = liblog.get_logger(__name__)
        self._length = len(iterator)
        self._max_accuracy = -1.0
        liblog._set_tqdm_handler()
        liblog._set_tqdm_print()
Example #4
    def __init__(self,
                 metric: Callable[[Mapping], Any],
                 name: str,
                 logger=None,
                 no_reduce: bool = False):
        if metric is not None:
            self.metric_function = metric
        self.metric_name = name
        self._last_iter = {}
        self._last_epoch = {}
        self._metrics_history = {}
        self._logger = get_logger(__name__) if logger is None else logger
        self._warning_flag = True
        self._no_reduce = no_reduce
Example #5
    def __init__(self,
                 metric: Callable[[Mapping], Any],
                 name: str,
                 logger=None,
                 reduction="average",
                 no_reduce: bool = False):

        if metric is not None:
            self.metric_function = metric
        self.metric_name = name
        self._last_iter = {}
        self._last_epoch = {}
        self._metrics_history = {}
        self._logger = get_logger(__name__) if logger is None else logger
        self._no_reduce = no_reduce

        if reduction not in ("average", "sum"):
            raise RuntimeError(
                f"`reduction` should be 'average' or 'sum', but got {reduction} instead"
            )
        self.reduction = reduction
Example #6
    def __init__(self, model: nn.Module or Dict[str, nn.Module],
                 optimizer: Optional[Optimizer or Dict[str, Optimizer] or torch.optim.Optimizer],
                 loss_f: Optional[Callable or Dict[str, Callable]], *,
                 callbacks: Optional[Callback or Iterable[Callable]] = None,
                 scheduler: Optional[LRScheduler or Dict[str, LRScheduler]] = None,
                 update_scheduler_by_epoch: bool = True,
                 device: Optional[torch.device or str] = None,
                 verb=True, use_cudnn_benchmark=True, use_cuda_nonblocking=False, logger=None, **kwargs):

        if logger is None:
            logger = get_logger(__name__)
        super(TrainerBase, self).__init__(model, callbacks, device, use_cudnn_benchmark, use_cuda_nonblocking, logger,
                                          **kwargs)

        # set optimizer(s)
        if optimizer is None:
            self.optimizer = None
        elif isinstance(optimizer, Optimizer):
            self.optimizer = optimizer.set_model(self.model.parameters())
        elif isinstance(optimizer, torch.optim.Optimizer):
            self.optimizer = optimizer
        elif isinstance(optimizer, dict):
            if not isinstance(model, dict):
                raise TypeError(f"model is not dict but optimizer is dict!")
            self.optimizer = StepDict(torch.optim.Optimizer)
            # self.model is nn.ModuleDict, then self.optimizer is StepDict
            for k, opt in optimizer.items():
                m = self.model._modules.get(k)
                if m is None:
                    raise KeyError(f"No such key {k} in model!")
                if opt is None:
                    self.optimizer[k] = None
                elif isinstance(opt, Optimizer):
                    self.optimizer[k] = opt.set_model(m.parameters())
                else:
                    raise TypeError(f"Unknown type: {type(opt)}")
        else:
            raise TypeError(f"Unknown type: {type(optimizer)}")
        self.logger.debug(f"Use optimizer: {self.optimizer.__class__.__name__}")

        # set scheduler(s)
        self.update_scheduler_by_epoch = update_scheduler_by_epoch
        self.update_scheduler(scheduler, update_scheduler_by_epoch)

        self.logger.debug(f"Use scheduler: {self.scheduler.__class__.__name__}")

        self.loss_f = loss_f
        self._verb = verb

        # called via property
        # _step and _epoch are set to -1 because they are incremented before each iteration and epoch!
        self._step = -1
        self._epoch = -1
        self._is_train = True

        _map_base = {MODEL: self.model,
                     OPTIMIZER: self.optimizer,
                     SCHEDULER: self.scheduler,
                     TRAINER: self}
        self._iteration_map = Map(**_map_base.copy())
        self._epoch_map = Map(**_map_base.copy())
        self._all_map = Map(**_map_base.copy())

        self._callbacks.before_all(self._all_map)
Example #7
from __future__ import annotations

import torch
from torch import Tensor

from homura.liblog import get_logger

logger = get_logger(__name__)

__all__ = [
    "true_positive", "true_negative", "false_positive", "false_negative",
    "classwise_accuracy", "precision", "recall", "specificity", "f1_score",
    "confusion_matrix", "accuracy"
]


def _base(input: Tensor, target: Tensor) -> tuple[Tensor, Tensor, Tensor]:
    classes = torch.arange(input.size(1), device=input.device)
    pred = input.argmax(dim=1).view(-1, 1)
    target = target.view(-1, 1)
    return pred, target, classes


def true_positive(input: Tensor, target: Tensor) -> Tensor:
    """Calculate true positive

    :param input: output of network, expected to be `BxCx(OPTIONAL DIMENSIONS)`
    :param target: target, expected to be `Bx(OPTIONAL DIMENSIONS)`
    :return: true positive in float tensor of `C`
    """
Example #8
    def __init__(self,
                 model: nn.Module or Dict[str, nn.Module],
                 optimizer: Optional[Partial or Optimizer
                                     or Dict[str, Optimizer]],
                 loss_f: Optional[Callable or Dict[str, Callable]],
                 *,
                 callbacks: Optional[Iterable[Callback]] = None,
                 scheduler: Optional[Partial or Scheduler
                                     or Dict[str, Scheduler]] = None,
                 update_scheduler_by_epoch: bool = True,
                 device: Optional[torch.device or str] = None,
                 verb: bool = True,
                 use_cudnn_benchmark: bool = True,
                 use_cuda_nonblocking: bool = False,
                 use_horovod: bool = False,
                 logger=None,
                 use_sync_bn: bool = False,
                 tqdm_ncols: int = 80,
                 **kwargs):

        if logger is None:
            logger = get_logger(__name__)
        self.logger = logger

        if device is None:
            self.device = torch.device(
                GPU) if torch.cuda.is_available() else torch.device(CPU)
        else:
            self.device = device

        if use_horovod and not is_horovod_available():
            raise RuntimeError('horovod is not available!')

        if is_distributed():
            if use_sync_bn:
                model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

            rank = get_local_rank()
            torch.cuda.set_device(rank)
            if get_global_rank() > 0:
                # to avoid overwriting
                verb = False

        if isinstance(model, nn.Module):
            self.model = model
        elif isinstance(model, dict):
            self.model = nn.ModuleDict(model)
        else:
            raise TypeError(
                f"Unknown type for `model`. Expected nn.Module or Dict[str, Module] but got {type(model)}"
            )

        if GPU in str(self.device):
            self.model.to(self.device)
            torch.backends.cudnn.benchmark = use_cudnn_benchmark
            self._cuda_nonblocking = use_cuda_nonblocking
            self.logger.debug(
                f"cuda: True, cudnn.benchmark: {use_cudnn_benchmark}, "
                f"cuda.nonblocking: {use_cuda_nonblocking}")
        else:
            self._cuda_nonblocking = False
            # usually, this is not expected
            self.logger.info(
                f"cuda: False (torch.cuda.is_available()={torch.cuda.is_available()})"
            )

        if not use_horovod and is_distributed():
            self.model = nn.parallel.DistributedDataParallel(self.model,
                                                             device_ids=[rank])

        if isinstance(self.model,
                      nn.parallel.DistributedDataParallel) or isinstance(
                          self.model, nn.DataParallel):
            self.accessible_model = self.model.module
        else:
            self.accessible_model = self.model

        self.optimizer = None
        self.scheduler = None
        self._callbacks = None
        self.update_scheduler_by_epoch = update_scheduler_by_epoch
        self._set_optimizer(optimizer)
        self._set_scheduler(scheduler)
        self._set_callbacks(callbacks)

        if use_horovod:
            import horovod.torch as hvd

            hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
            hvd.broadcast_optimizer_state(self.optimizer, root_rank=0)
            self.optimizer = hvd.DistributedOptimizer(
                self.optimizer, named_parameters=self.model.named_parameters())

        self.loss_f = loss_f
        self._verb = verb

        # called via property
        # _step and _epoch are set to -1 because they are incremented before each iteration and epoch!
        self._step = -1
        self._epoch = -1
        self._is_train = True
        # to nest, leave=False (https://github.com/tqdm/tqdm/blob/master/examples/simple_examples.py#L19)
        self._tqdm = Partial(tqdm, ncols=tqdm_ncols,
                             leave=False) if verb else lambda x: x

        _map_base = {
            MODEL: self.accessible_model,
            OPTIMIZER: self.optimizer,
            SCHEDULER: self.scheduler,
            TRAINER: self
        }
        self._iteration_map = Map(**_map_base.copy())
        self._epoch_map = Map(**_map_base.copy())
        self._all_map = Map(**_map_base.copy())

        for k, v in kwargs.items():
            if hasattr(self, k):
                raise AttributeError(f"{self} already has {k}")
            if torch.is_tensor(v):
                v = v.to(self.device)
            if isinstance(v, nn.Module):
                v.to(self.device)
            setattr(self, k, v)

        self._callbacks.before_all(self._all_map)
Example #9
""" Helper functions to get information about the environment.
"""

import importlib.util
import os as python_os
import subprocess
import sys as python_sys
from typing import Any, Optional

from homura.liblog import get_logger

logger = get_logger("homura.environment")


# Utility functions that useful libraries are available or not
def is_accimage_available() -> bool:
    return importlib.util.find_spec("accimage") is not None


def enable_accimage() -> None:
    if is_accimage_available():
        import torchvision

        torchvision.set_image_backend("accimage")
        logger.info("accimage is activated")
    else:
        logger.warning("accimage is not available")


def is_faiss_available() -> bool:
    _faiss_available = importlib.util.find_spec("faiss") is not None
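
The snippet cuts off inside is_faiss_available. Following the is_accimage_available pattern above, a plausible completion (a sketch, not necessarily the library's exact code) simply returns the flag:

def is_faiss_available() -> bool:
    # same find_spec probe as is_accimage_available: True only if the module resolves
    _faiss_available = importlib.util.find_spec("faiss") is not None
    return _faiss_available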
Example #10
    def __init__(self,
                 model: nn.Module or Dict[str, nn.Module],
                 optimizer: Optional[Partial or Optimizer
                                     or Dict[str, Optimizer]],
                 loss_f: Optional[Callable or Dict[str, Callable]] = None,
                 *,
                 reporters: Optional[_ReporterBase
                                     or List[_ReporterBase]] = None,
                 scheduler: Optional[Partial or Scheduler
                                     or Dict[str, Scheduler]] = None,
                 update_scheduler_by_epoch: bool = True,
                 device: Optional[torch.device or str] = None,
                 verb: bool = True,
                 use_cudnn_benchmark: bool = True,
                 use_cuda_nonblocking: bool = False,
                 use_horovod: bool = False,
                 logger=None,
                 use_sync_bn: bool = False,
                 tqdm_ncols: int = 80,
                 **kwargs):

        if kwargs.get("callbacks"):
            raise DeprecationWarning(
                "callback is deprecated, if you need, use homura before v2020.8"
            )

        self.logger = logger or get_logger(__name__)

        self.device = device or (torch.device(
            GPU) if torch.cuda.is_available() else torch.device(CPU))

        # setup for distributed
        self._use_sync_bn = use_sync_bn
        if is_distributed():
            if self._use_sync_bn:
                model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
                self.logger.info(
                    "BNs of model are converted to nn.SyncBatchNorm")

            rank = get_local_rank()
            torch.cuda.set_device(rank)
            if get_global_rank() > 0:
                # to avoid overwriting
                verb = False

        # setup model
        if isinstance(model, nn.Module):
            self.model = model
        elif isinstance(model, dict):
            self.model = nn.ModuleDict(model)
        else:
            raise TypeError(
                f"Unknown type for `model`. Expected nn.Module or Dict[str, Module], but got {type(model)}"
            )

        if GPU in str(self.device):
            self.model.to(self.device)
            torch.backends.cudnn.benchmark = use_cudnn_benchmark
            self._cuda_nonblocking = use_cuda_nonblocking
            self.logger.debug(
                f"cuda: True, cudnn.benchmark: {use_cudnn_benchmark}, "
                f"cuda.nonblocking: {use_cuda_nonblocking}")
        else:
            self._cuda_nonblocking = False
            # usually, this is not expected
            self.logger.info(
                f"cuda: False (torch.cuda.is_available()={torch.cuda.is_available()})"
            )

        if not use_horovod and is_distributed():
            self.model = nn.parallel.DistributedDataParallel(self.model,
                                                             device_ids=[rank])

        # self.accessible_model is useful for e.g., checkpointing
        if isinstance(self.model,
                      nn.parallel.DistributedDataParallel) or isinstance(
                          self.model, nn.DataParallel):
            self.accessible_model = self.model.module
        else:
            self.accessible_model = self.model

        self.loss_f = loss_f
        self._verb = verb

        # setup optimizer and scheduler
        self.optimizer = optimizer
        self.scheduler = scheduler
        self._update_scheduler_by_epoch = update_scheduler_by_epoch
        self.set_optimizer()
        self.set_scheduler()

        if use_horovod:
            if not is_horovod_available():
                raise RuntimeError("horovod is not available!")
            import horovod.torch as hvd

            hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
            hvd.broadcast_optimizer_state(self.optimizer, root_rank=0)
            self.optimizer = hvd.DistributedOptimizer(
                self.optimizer, named_parameters=self.model.named_parameters())

        if reporters is not None and not isinstance(reporters, Iterable):
            reporters = [reporters]
        reporters = reporters or []

        if not any([isinstance(rep, TQDMReporter) for rep in reporters]):
            # if reporters not contain TQDMReporter
            reporters.append(TQDMReporter(ncols=tqdm_ncols))
        self.reporter = ReporterList(reporters)

        # called via property
        # _step and _epoch are set to -1 because they are incremented before each iteration and epoch
        self._step = -1
        self._epoch = -1
        self._is_train = True

        # to nest, leave=False (https://github.com/tqdm/tqdm/blob/master/examples/simple_examples.py#L19)
        self._tqdm = lambda x: x
        if verb:
            self._tqdm = Partial(tqdm, ncols=tqdm_ncols, leave=False)
            _set_tqdm_print()

        for k, v in kwargs.items():
            if hasattr(self, k):
                raise AttributeError(f"{self} already has {k}")
            if torch.is_tensor(v):
                v = v.to(self.device)
            if isinstance(v, nn.Module):
                v.to(self.device)
            setattr(self, k, v)
            self.logger.debug(f"trainer sets {k} as a new attribute")
Example #11
""" Helper functions to make distributed training easy
"""

import builtins
import os as python_os
import warnings
from functools import wraps
from typing import Callable, Optional

from torch import distributed
from torch.cuda import device_count

from homura.liblog import get_logger
from .environment import get_args, get_environ

logger = get_logger("homura.distributed")
original_print = builtins.print


def is_horovod_available() -> bool:
    warnings.warn("horovod is no longer supported by homura", DeprecationWarning)
    return False


def is_distributed_available() -> bool:
    return distributed.is_available()


def is_distributed() -> bool:
    """ Check if the process is distributed by checking the world size is larger than 1.
    """
Example #12
from functools import partial
from itertools import cycle
from statistics import median
from typing import Tuple, Mapping, Callable, Optional

import torch
from homura import optim, trainers, reporters, callbacks, Map, get_args, lr_scheduler
from homura.liblog import get_logger
from homura.modules import exponential_moving_average_, to_onehot
from torch import nn
from torch.nn import functional as F

from backends.data import get_dataloader
from backends.wrn import wrn28_2

logger = get_logger(__file__)


class PackedLoader(object):
    def __init__(self, trusted_loader, untrusted_loader):
        self.l_loaders = trusted_loader
        self.u_loaders = untrusted_loader
        self._size = len(untrusted_loader)

    def __len__(self):
        return self._size

    def __iter__(self):
        for l, u in zip(cycle(self.l_loaders), self.u_loaders):
            yield list(l) + list(u)
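
To show how PackedLoader combines the two loaders, here is a hypothetical toy usage; the plain lists stand in for real DataLoaders:

trusted = [["t0"], ["t1"]]            # small trusted/labelled loader (2 batches)
untrusted = [["u0"], ["u1"], ["u2"]]  # larger untrusted loader (3 batches)
packed = PackedLoader(trusted, untrusted)
print(len(packed))                    # 3, since the length follows the untrusted loader
for batch in packed:                  # trusted batches are cycled to keep up
    print(batch)                      # ['t0', 'u0'], ['t1', 'u1'], ['t0', 'u2']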
Example #13
    def __init__(self,
                 model: nn.Module or Dict[str, nn.Module],
                 optimizer: Optional[Partial or Optimizer
                                     or Dict[str, Optimizer]],
                 loss_f: Optional[Callable or Dict[str, Callable]] = None,
                 *,
                 reporters: Optional[_ReporterBase
                                     or List[_ReporterBase]] = None,
                 scheduler: Optional[Partial or Scheduler
                                     or Dict[str, Scheduler]] = None,
                 device: Optional[torch.device or str] = None,
                 quiet: bool = True,
                 disable_cudnn_benchmark: bool = False,
                 disable_cuda_nonblocking: bool = False,
                 logger=None,
                 use_sync_bn: bool = False,
                 tqdm_ncols: int = 120,
                 debug: bool = False,
                 **kwargs):

        if kwargs.get("update_scheduler_by_epoch"):
            raise DeprecationWarning(
                "update_scheduler_by_epoch is deprecated, users need to step")

        if kwargs.get("callbacks"):
            raise DeprecationWarning(
                "callback is deprecated, if you need, use homura before v2020.8"
            )

        self.logger = logger or get_logger(__name__)
        self.device = device or (torch.device(
            GPU) if torch.cuda.is_available() else torch.device(CPU))
        self._is_debug = debug

        if self._is_debug:
            self.logger.warning(
                "Trainer is set to be debug mode, which may affect the performance"
            )
            set_verb_level("debug")

        # setup for distributed
        self._use_sync_bn = use_sync_bn
        if is_distributed():
            if self._use_sync_bn:
                model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
                self.logger.info(
                    "BNs of model are converted to nn.SyncBatchNorm")

            rank = get_local_rank()
            torch.cuda.set_device(rank)
            if get_global_rank() > 0:
                # to avoid overwriting
                quiet = True

        self.loss_f = loss_f
        self._verbose = not quiet

        # setup model
        if isinstance(model, nn.Module):
            self.model = model
        elif isinstance(model, dict):
            self.model = nn.ModuleDict(model)
            self.logger.debug(f"model is nn.ModuleDict of {self.model.keys()}")
        else:
            raise TypeError(
                f"Unknown type for `model`. Expected nn.Module or Dict[str, Module], but got {type(model)}"
            )

        if GPU in str(self.device):
            self.model.to(self.device)
            torch.backends.cudnn.benchmark = not disable_cudnn_benchmark
            self._cuda_nonblocking = not disable_cuda_nonblocking
            self.logger.debug(
                f"cuda: True, cudnn.benchmark: {not disable_cudnn_benchmark}, "
                f"cuda.nonblocking: {not disable_cuda_nonblocking}")
        else:
            self._cuda_nonblocking = False
            # usually, this is not expected
            self.logger.info(
                f"cuda: False (torch.cuda.is_available()={torch.cuda.is_available()})"
            )

        if is_distributed():
            self.model = nn.parallel.DistributedDataParallel(self.model,
                                                             device_ids=[rank])
            self.logger.debug(
                f"model converted to DistributedDataParallel at rank={rank}")

        # self.accessible_model is useful for e.g., checkpointing
        if isinstance(self.model,
                      nn.parallel.DistributedDataParallel) or isinstance(
                          self.model, nn.DataParallel):
            self.accessible_model = self.model.module
        else:
            self.accessible_model = self.model

        # setup optimizer and scheduler
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.set_optimizer()
        self.set_scheduler()

        if reporters is not None and not isinstance(reporters, Iterable):
            reporters = [reporters]
        reporters = reporters or []

        if not any([isinstance(rep, TQDMReporter) for rep in reporters]):
            # if reporters not contain TQDMReporter
            reporters.append(TQDMReporter(ncols=tqdm_ncols))
        self.logger.debug(f"reporter is ready: {reporters}")
        self.reporter = ReporterList(reporters)

        # called via property
        # _step and _epoch are set to -1 because they are incremented before each iteration and epoch
        self._step = -1
        self._epoch = -1
        self._is_train = True

        # to nest, leave=False (https://github.com/tqdm/tqdm/blob/master/examples/simple_examples.py#L19)
        self._tqdm = lambda x: x
        if self._verbose:
            self._tqdm = Partial(tqdm, ncols=tqdm_ncols, leave=False)
            set_tqdm_stdout_stderr()
            self.logger.debug("verbose: setup tqdm")
        else:
            self.logger.debug("quiet: no tqdm")

        for k, v in kwargs.items():
            if hasattr(self, k):
                raise AttributeError(f"{self} already has {k}")
            if isinstance(v, torch.Tensor):
                v = v.to(self.device)
            if isinstance(v, nn.Module):
                v.to(self.device)
            setattr(self, k, v)
            self.logger.debug(f"trainer sets {k} as a new attribute")
Example #14
import importlib.util
import os as python_os
import subprocess
import sys as python_sys

from torch.cuda import device_count

from homura.liblog import get_logger

__all__ = [
    "is_accimage_available", "is_apex_available", "is_distributed",
    "enable_accimage", "get_global_rank", "get_local_rank", "get_world_size",
    "get_num_nodes"
]

logger = get_logger("homura.env")
is_accimage_available = importlib.util.find_spec("accimage") is not None
is_apex_available = importlib.util.find_spec("apex") is not None

args = " ".join(python_sys.argv)
is_distributed = "--local_rank" in args


def _decode_bytes(b: bytes) -> str:
    return b.decode("ascii")[:-1]


def get_git_hash() -> str:
    try:
        is_git_repo = subprocess.run(
            ["git", "rev-parse", "--is-inside-work-tree"],
Example #15
    def __init__(self,
                 model: nn.Module or Dict[str, nn.Module],
                 callbacks: Optional[Callback or Iterable[Callable]] = None,
                 device: torch.device or str = None,
                 use_cudnn_benchmark=True,
                 use_cuda_nonblocking=False,
                 logger: Optional[Logger] = None,
                 **kwargs):

        self.logger = get_logger(__name__) if logger is None else logger
        if device is None:
            self.device = GPU if torch.cuda.is_available() else CPU
        else:
            self.device = device

        # set model(s)
        if isinstance(model, nn.Module):
            self.model = model
            self._is_single_model = True
        elif isinstance(model, dict):
            self.model = nn.ModuleDict(model)
            self._is_single_model = False
        else:
            raise TypeError(
                f"Unknown type for arg. model. Expected nn.Module or "
                f"Dict[str, Module] but got {type(model)}")

        if GPU in str(self.device):
            if use_cudnn_benchmark:
                torch.backends.cudnn.benchmark = True
            self.model.to(self.device)
            self._cuda_nonblocking = use_cuda_nonblocking
            self.logger.debug(
                f"cuda: True, cudnn.benchmark: {use_cudnn_benchmark}, nonblocking: {use_cuda_nonblocking}"
            )
        else:
            self._cuda_nonblocking = False
            self.logger.info("Running on CPU!")

        # set callback(s)
        if isinstance(callbacks, CallbackList):
            self._callbacks = callbacks
        elif isinstance(callbacks, Callback):
            self._callbacks = callbacks
            self.logger.debug(
                f"registered callback {callbacks.__class__.__name__}")
        elif isinstance(callbacks, Iterable):
            self._callbacks = CallbackList(*callbacks)
        elif callbacks is None:
            # if callback is not set
            self._callbacks = Callback()
            self.logger.debug(f"No callback registered")
        else:
            raise TypeError(
                f"type(callbacks) should not be {type(callbacks)}!")

        # set kwargs
        for k, v in kwargs.items():
            if hasattr(self, k):
                raise AttributeError(f"{self} already has {k}")
            if torch.is_tensor(v):
                v = v.to(self.device)  # Tensor.to is not in-place, so keep the moved tensor
            setattr(self, k, v)